Merge remote-tracking branch 'ups/develop' into fea/jitkernel

77fc42d2 · tensor-tang · 2937314d · 7a5f3f75 · 77fc42d2 · 77fc42d2
142 changed file
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          ${CMAKE_ARGS_PREFIX}
+                        -DUSE_LOGGER=YES
                        -DUSE_X86_PLACE=YES
                        -DBUILD_WITH_UNIT_TEST=NO
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -27,7 +27,6 @@ endfunction()

 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name)
    endif()
 endmacro()

+macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
+    if (BUILD_SHARED_LIBS) 
+        return() # if build shared libs, the flags keep same with '/MD'
+    endif(BUILD_SHARED_LIBS)
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif(${flag_var} MATCHES "/MD")
+    endforeach(flag_var)
+endmacro()

 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "")

 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
+
+# https://github.com/PaddlePaddle/Paddle/issues/12773
+if (NOT WIN32)
 set(COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
+    -Werror
    -Wall
    -Wextra
    -Wnon-virtual-dtor
@@ -114,11 +131,6 @@ set(COMMON_FLAGS
    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )

-# https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-list(APPEND COMMON_FLAGS -Werror)
-endif()
-
 set(GPU_COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS
    -Wno-error=array-bounds # Warnings in Eigen::array
 )

+else(NOT WIN32)
+set(COMMON_FLAGS
+    "/w") #disable all warnings.
+set(GPU_COMMON_FLAGS
+    "/w") #disable all warnings
+endif(NOT WIN32)
+
 if (APPLE)
    if(NOT CMAKE_CROSSCOMPILING)
        # On Mac OS X build fat binaries with x86_64 architectures by default.
        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
    endif()
-else()
+endif(APPLE)
+
+if(LINUX)
    set(GPU_COMMON_FLAGS
        -Wall
        -Wextra
        -Werror
        ${GPU_COMMON_FLAGS})
-endif()
+endif(LINUX)

 if(UNIX AND NOT APPLE)
  # except apple from nix*Os family
  set(LINUX TRUE)
 endif(UNIX AND NOT APPLE)

-
 foreach(flag ${COMMON_FLAGS})
    safe_set_cflag(CMAKE_C_FLAGS ${flag})
    safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
+
 endforeach()

 foreach(flag ${GPU_COMMON_FLAGS})
    safe_set_nvflag(${flag})
 endforeach()
+
+if(WIN32)
+# windows build turn off warnings.
+safe_set_static_flag()
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/W3")
+        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
+      endif(${flag_var} MATCHES "/W3")
+    endforeach(flag_var)
+endif(WIN32)
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
 add_custom_target(paddle_apis ALL
-                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+                  DEPENDS paddle_v2_apis)

 add_custom_target(paddle_docs ALL
                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn
                  paddle_mobile_docs paddle_mobile_docs_cn)

 add_subdirectory(v2)
-add_subdirectory(fluid)
 add_subdirectory(mobile)
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -102,8 +102,8 @@ class Float16Transpiler:
                continue
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
-                                            self.input_map[input_arg])
+                    current_op._rename_input(input_arg,
+                                             self.input_map[input_arg])

    def _remove_unused_var(self):
        '''
@@ -187,7 +187,7 @@ class Float16Transpiler:
                    shape=var.shape,
                    persistable=var.persistable)
                find_op(var)
-                var.op.rename_output(var_name, tmp_var_name)
+                var.op._rename_output(var_name, tmp_var_name)
                self.block._insert_op(
                    i,
                    type="cast",

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
-paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -38,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
@@ -170,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_
 paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
 paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
 paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
+paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
+paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
+paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
+paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -241,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
 paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -315,13 +298,18 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
+paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
+paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
 endif()
+
+add_subdirectory(train)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -56,9 +56,9 @@ else()
  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 if (NOT WIN32)
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 else()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 endif (NOT WIN32)

 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
@@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)

+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+
 if(WITH_DISTRIBUTE)
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
 endif()
 
 if (NOT WIN32)
-  cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-          threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
-          graph graph_viz_pass multi_devices_graph_pass
-          multi_devices_graph_print_pass multi_devices_graph_check_pass
-          fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph build_strategy
+        fast_threaded_ssa_graph_executor)
 endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+
+cc_library(build_strategy SRCS build_strategy.cc DEPS
+        graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fuse_elewise_add_act_pass)
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ParallelExecutorPassBuilder : public ir::PassBuilder {
+ public:
+  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
+      : ir::PassBuilder(), strategy_(strategy) {
+    // Add a graph viz pass to record a graph.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto viz_pass = AppendPass("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    }
+
+    // Add op fusion.
+    if (strategy.fuse_elewise_add_act_ops_) {
+      auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
+      // Add a graph viz pass to record a graph.
+      if (!strategy.debug_graphviz_path_.empty()) {
+        auto viz_pass = AppendPass("graph_viz_pass");
+        const std::string graph_path = string::Sprintf(
+            "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
+        viz_pass->Set<std::string>("graph_viz_path",
+                                   new std::string(graph_path));
+      }
+    }
+
+    // Convert graph to run on multi-devices.
+    auto multi_devices_pass = AppendPass("multi_devices_pass");
+    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
+                                                         &strategy_);
+
+    // Add a graph print pass to record a graph with device info.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
+      multi_devices_print_pass->SetNotOwned<const std::string>(
+          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+          "graph_printer", new details::GraphvizSSAGraphPrinter);
+    }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
+  }
+
+ private:
+  BuildStrategy strategy_;
+};
+
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
+    const {
+  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  return pass_builder_;
+}
+
+std::unique_ptr<ir::Graph> BuildStrategy::Apply(
+    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+    const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
+#else
+    const bool use_cuda) const {
+#endif
+  // Create a default one if not initialized by user.
+  if (!pass_builder_) {
+    CreatePassesFromStrategy();
+  }
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+
+  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
+    if (pass->Type() == "multi_devices_pass") {
+      pass->Erase("places");
+      pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
+      pass->Erase("loss_var_name");
+      pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
+      pass->Erase("params");
+      pass->SetNotOwned<const std::unordered_set<std::string>>("params",
+                                                               &param_names);
+      pass->Erase("local_scopes");
+      pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                    &local_scopes);
+#ifdef PADDLE_WITH_CUDA
+      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase("nccl_ctxs");
+      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+    }
+    graph = pass->Apply(std::move(graph));
+  }
+  return graph;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(fuse_elewise_add_act_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(multi_devices_pass);
+USE_PASS(multi_devices_check_pass);
+USE_PASS(multi_devices_print_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -15,6 +15,17 @@
 #pragma once

 #include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -57,6 +68,30 @@ struct BuildStrategy {
  bool fuse_elewise_add_act_ops_{false};

  bool enable_data_balance_{false};
+
+  // User normally doesn't need to call this API.
+  // The PassBuilder allows for more customized insert, remove of passes
+  // from python side.
+  // A new PassBuilder is created based on configs defined above and
+  // passes are owned by the PassBuilder.
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+
+  // Apply the passes built by the pass_builder_. The passes will be
+  // applied to the Program and output an ir::Graph.
+  std::unique_ptr<ir::Graph> Apply(
+      const ProgramDesc &main_program,
+      const std::vector<platform::Place> &places,
+      const std::string &loss_var_name,
+      const std::unordered_set<std::string> &param_names,
+      const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
+#else
+      const bool use_cuda) const;
+#endif
+
+ private:
+  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };

 }  // namespace details

--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,79 +20,37 @@ namespace paddle {
 namespace framework {
 namespace details {

-// Change it to thread safe flags if needed.
-class ThreadUnsafeOwnershipFlags {
+template <class T>
+class COWPtr {
 public:
-  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
-
-  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags& operator=(
-      const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
-
-  void SetOwnership(bool flag) { flag_ = flag; }
-
-  // Invoke the callback if it is not owned.
-  template <typename Callback>
-  void AcquireOwnershipOnce(Callback acquire) {
-    if (!flag_) {
-      acquire();
-      flag_ = true;
-    }
-  }
+  typedef std::shared_ptr<T> RefPtr;

 private:
-  bool flag_;
-};
+  RefPtr m_sp;

-// Copy-On-Write pointer.
-// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
-//
-// The template parameter OwnershipFlags should have:
-//   * a constructor takes a bool. True if own.
-//   * SetOwnership(bool flag).
-//   * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
-//     owned.
-//
-// https://en.wikipedia.org/wiki/Copy-on-write
-template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
-class COWPtr {
 public:
-  // Ctor from raw pointer.
-  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+  COWPtr() : m_sp(nullptr) {}
+  explicit COWPtr(T* t) : m_sp(t) {}

-  // Move methods. Steal ownership from origin
-  COWPtr(COWPtr&& other)
-      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
-  COWPtr& operator=(COWPtr&& origin) = default;
+  const T& Data() const { return *m_sp; }

-  // Copy methods. Not own payload
-  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
-  COWPtr& operator=(const COWPtr& other) {
-    payload_ = other.payload_;
-    ownership_.SetOwnership(false);
-    return *this;
-  }
-
-  // Access read only data.
-  const T& Data() const { return *payload_; }
-
-  // Access mutable data. If the data is not owned, the data will be copied
-  // before.
  T* MutableData() {
-    ownership_.AcquireOwnershipOnce(
-        [this] { payload_.reset(new T(*payload_)); });
-    return payload_.get();
+    DetachIfNotUnique();
+    return m_sp.get();
  }

- private:
-  // Actual data pointer.
-  std::shared_ptr<T> payload_;
+  void DetachIfNotUnique() {
+    T* tmp = m_sp.get();
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
+  }

-  // Ownership flag.
-  OwnershipFlags ownership_;
+  void Detach() {
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
+  }
 };
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
  ASSERT_EQ(ptr2.Data(), 10);
 }

+TEST(COWPtr, change_old) {
+  COWPtr<int> ptr(new int{0});
+  COWPtr<int> ptr2 = ptr;
+  *ptr.MutableData() = 10;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(ptr.Data(), 10);
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if(WITH_MKLDNN)
-  pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif()
+if (WITH_MKLDNN)
+    pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif ()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
@@ -41,12 +41,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass

 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

+cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
+
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-if(WITH_MKLDNN)
-  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-endif()
+if (WITH_MKLDNN)
+    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif ()
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PDPattern external_pattern, subblock_pattern;

+  // Use the following variables to tell whether this model is RNN1.
+  // This fuse can only works on the RNN1 model.
+  std::unordered_set<std::string> specified_vars({"data_lod_attention",
+                                                  "cell_init", "hidden_init",
+                                                  "data", "week", "minute"});
+  int count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsVar() && specified_vars.count(node->Name())) {
+      ++count;
+    }
+  }
+  if (count < specified_vars.size()) {
+    return graph;
+  }
+
+  // Continue to fuse.
  FindWhileOp(graph.get());
  return graph;
 }

--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    const std::string BatchedCellPreAct =
        patterns::UniqueKey("BatchedCellPreAct");
    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+    const std::string CheckedCell = patterns::UniqueKey("CheckedCell");

    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+    scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();

    op_desc.SetInput("H0", {});
    op_desc.SetInput("C0", {});
@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    op_desc.SetOutput("BatchedGate", {BatchedGate});
    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetOutput("CheckedCell", {CheckedCell});
    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
    // TODO(TJ): get from attr

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include <algorithm>
+#include <deque>
 #include <unordered_set>

-#include "paddle/fluid/framework/ir/graph_helper.h"
-
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
  return adj_list;
 }

+size_t GraphNum(const Graph &graph) {
+  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> visited_nodes;
+  visited_nodes.reserve(nodes.size());
+  std::deque<ir::Node *> q_nodes;
+  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
+  std::unordered_set<ir::Node *> g_nodes;
+  size_t graph_count = 0;
+
+  auto traverse_nodes = [&visited_nodes,
+                         &q_nodes](const std::vector<ir::Node *> &nodes) {
+    std::copy_if(
+        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
+        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+  };
+
+  while (visited_nodes.size() != nodes.size()) {
+    if (!q_nodes.empty()) {
+      auto cur_node = q_nodes.front();
+      q_nodes.pop_front();
+      visited_nodes.insert(cur_node);
+      g_nodes.insert(cur_node);
+      traverse_nodes(cur_node->inputs);
+      traverse_nodes(cur_node->outputs);
+    } else {
+      ++graph_count;
+      if (g_nodes.size()) {
+        graph_nodes.emplace_back(g_nodes);
+      }
+      g_nodes.clear();
+      for (auto &n : nodes) {
+        if (visited_nodes.count(n) == 0) {
+          q_nodes.push_back(n);
+          break;
+        }
+      }
+    }
+  }
+
+  if (g_nodes.size()) {
+    graph_nodes.emplace_back(g_nodes);
+  }
+
+  if (VLOG_IS_ON(10)) {
+    VLOG(10) << "graph_num: " << graph_nodes.size();
+    for (auto &g_n : graph_nodes) {
+      VLOG(10) << "graph_nodes: " << g_n.size();
+      if (g_n.size() < 10) {
+        std::stringstream out;
+        for (auto &node : g_n) {
+          out << "\nNode: " << node->Name() << " in [";
+          for (auto &n : node->inputs) {
+            out << n->Name() << ", ";
+          }
+          out << "], out[";
+          for (auto &n : node->outputs) {
+            out << n->Name() << ", ";
+          }
+          out << "]";
+        }
+        VLOG(10) << out.str();
+      }
+    }
+  }
+
+  return graph_count;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -27,6 +27,8 @@ namespace ir {
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);

+size_t GraphNum(const Graph &graph);
+
 // Topology Sort the operations in the graph from inputs to outputs.
 // `graph` cannot contain circle.
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph);

--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) {
  ASSERT_EQ(node_map.at("op2"), 1UL);
  ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
 }
+
+void BuildZeroGraph(Graph* g) {}
+
+void BuildOneGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  v4->outputs.push_back(o5);
+}
+
+void BuildTwoGraphs(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  //  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  //  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  //  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  //  v4->outputs.push_back(o5);
+}
+
+TEST(GraphHelperTest, GraphNum) {
+  ProgramDesc prog;
+
+  Graph g(prog);
+  BuildZeroGraph(&g);
+  ASSERT_EQ(GraphNum(g), 0);
+
+  Graph g2(prog);
+  BuildOneGraph(&g2);
+  ASSERT_EQ(GraphNum(g2), 1);
+
+  Graph g3(prog);
+  BuildTwoGraphs(&g3);
+  ASSERT_EQ(GraphNum(g3), 2);
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,8 @@

 #include "paddle/fluid/framework/ir/graph_traits.h"

+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace ir {

--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
-  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
  for (const std::string& attr : required_pass_attrs_) {
    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),

--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -42,6 +42,8 @@ class Pass {
    attr_dels_.clear();
  }

+  std::string Type() const { return type_; }
+
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;

  // Get a reference to the attributed previously set.
@@ -52,6 +54,21 @@ class Pass {
    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
  }

+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+
+  void Erase(const std::string &attr_name) {
+    if (!Has(attr_name)) {
+      return;
+    }
+    if (attr_dels_.find(attr_name) != attr_dels_.end()) {
+      attr_dels_[attr_name]();
+      attr_dels_.erase(attr_name);
+    }
+    attrs_.erase(attr_name);
+  }
+
  // Set a pointer to the attribute. Pass takes ownership of the attribute.
  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr) {
@@ -68,13 +85,15 @@ class Pass {
  // should delete the attribute.
  template <typename AttrType>
  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
    attrs_[attr_name] = attr;
  }

 protected:
-  virtual std::unique_ptr<Graph> ApplyImpl(
-      std::unique_ptr<Graph> graph) const = 0;
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    LOG(FATAL) << "Calling virtual Pass not implemented.";
+  }

 private:
  template <typename PassType>
@@ -89,7 +108,10 @@ class Pass {
    required_graph_attrs_.insert(attrs.begin(), attrs.end());
  }

+  void RegisterType(const std::string &type) { type_ = type; }
+
  mutable bool applied_{false};
+  std::string type_;
  std::unordered_set<std::string> required_pass_attrs_;
  std::unordered_set<std::string> required_graph_attrs_;
  std::map<std::string, boost::any> attrs_;
@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
                   "'%s' is registered more than once.", pass_type);
    PassRegistry::Instance().Insert(
-        pass_type, [this]() -> std::unique_ptr<Pass> {
+        pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
          std::unique_ptr<Pass> pass(new PassType());
          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          pass->RegisterType(pass_type);
          return pass;
        });
  }

--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
+  auto pass = ir::PassRegistry::Instance().Get(pass_type);
+  passes_.emplace_back(pass.release());
+  return passes_.back();
+}
+
+void PassBuilder::RemovePass(size_t idx) {
+  PADDLE_ENFORCE(passes_.size() > idx);
+  passes_.erase(passes_.begin() + idx);
+}
+
+std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
+                                              const std::string& pass_type) {
+  PADDLE_ENFORCE(passes_.size() >= idx);
+  std::shared_ptr<Pass> pass(
+      ir::PassRegistry::Instance().Get(pass_type).release());
+  passes_.insert(passes_.begin() + idx, std::move(pass));
+  return passes_[idx];
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ b/paddle/fluid/framework/ir/pass_builder.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class PassBuilder {
+ public:
+  PassBuilder() {}
+
+  virtual ~PassBuilder() {}
+
+  // Append a new pass to the end.
+  std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
+
+  // Insert a new pass after `idx`.
+  std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
+
+  // Remove a new pass at `idx`.
+  void RemovePass(size_t idx);
+
+  // Returns a list of all passes.
+  std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
+
+ protected:
+  std::vector<std::shared_ptr<Pass>> passes_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);

-  try {
-    graph = pass->Apply(std::move(graph));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+  // Allow apply more than once.
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph = pass->Apply(std::move(graph));

  pass = PassRegistry::Instance().Get("test_pass");
  pass->SetNotOwned<int>("test_pass_attr", &val);

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,10 +17,13 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #include <vector>
-
+#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memcpy.h"

 #include "glog/logging.h"

@@ -28,206 +31,436 @@ namespace paddle {
 namespace framework {

 #if defined(PADDLE_WITH_CUDA)
+namespace details {
+struct CUDABuffer {
+  void *data_{nullptr};
+  size_t size_{0};
+  platform::CUDAPlace place_;
+
+  CUDABuffer() {}
+  CUDABuffer(platform::Place place, size_t size)
+      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
+    data_ = memory::Alloc(place_, size);
+  }
+
+  ~CUDABuffer() { ClearMemory(); }
+
+  CUDABuffer(const CUDABuffer &o) = delete;
+  CUDABuffer &operator=(const CUDABuffer &o) = delete;
+
+  void Resize(platform::Place place, size_t size) {
+    ClearMemory();
+    place_ = boost::get<platform::CUDAPlace>(place);
+    data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
+    size_ = size;
+  }
+
+  void Swap(CUDABuffer &o) {
+    std::swap(data_, o.data_);
+    std::swap(place_, o.place_);
+    std::swap(size_, o.size_);
+  }
+
+ private:
+  void ClearMemory() const {
+    if (data_ != nullptr) {
+      memory::Free(place_, data_);
+    }
+  }
+};
+}  // namespace details
+
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
 class Vector {
 public:
  using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;

-  // Default ctor. Create empty Vector
-  Vector() { InitEmpty(); }
+ private:
+  // The actual class to implement vector logic
+  class VectorData {
+   public:
+    VectorData() : flag_(kDataInCPU) {}
+    VectorData(size_t count, const T &value)
+        : cpu_(count, value), flag_(kDataInCPU) {}
+    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
+    template <typename U>
+    explicit VectorData(const std::vector<U> &dat)
+        : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
+
+    VectorData(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+    }

-  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T &value = T()) {
-    InitEmpty();
-    if (count != 0) {
-      resize(count);
-      T *ptr = begin();
-      for (size_t i = 0; i < count; ++i) {
-        ptr[i] = value;
+    VectorData &operator=(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+      details::CUDABuffer null;
+      gpu_.Swap(null);
+      return *this;
+    }
+
+    T &operator[](size_t i) {
+      MutableCPU();
+      return cpu_[i];
+    }
+
+    const T &operator[](size_t i) const {
+      ImmutableCPU();
+      return cpu_[i];
+    }
+
+    size_t size() const { return cpu_.size(); }
+
+    iterator begin() {
+      MutableCPU();
+      return cpu_.begin();
+    }
+
+    iterator end() {
+      MutableCPU();
+      return cpu_.end();
+    }
+
+    T &front() {
+      MutableCPU();
+      return cpu_.front();
+    }
+
+    T &back() {
+      MutableCPU();
+      return cpu_.back();
+    }
+
+    const_iterator begin() const {
+      ImmutableCPU();
+      return cpu_.begin();
+    }
+
+    const_iterator end() const {
+      ImmutableCPU();
+      return cpu_.end();
+    }
+
+    const T &back() const {
+      ImmutableCPU();
+      return cpu_.back();
+    }
+
+    T *data() { return &(*this)[0]; }
+
+    const T *data() const { return &(*this)[0]; }
+
+    const T &front() const {
+      ImmutableCPU();
+      return cpu_.front();
+    }
+
+    // assign this from iterator.
+    // NOTE: the iterator must support `end-begin`
+    template <typename Iter>
+    void assign(Iter begin, Iter end) {
+      MutableCPU();
+      cpu_.assign(begin, end);
+    }
+
+    // push_back. If the previous capacity is not enough, the memory will
+    // double.
+    void push_back(T elem) {
+      MutableCPU();
+      cpu_.push_back(elem);
+    }
+
+    // extend a vector by iterator.
+    // NOTE: the iterator must support end-begin
+    template <typename It>
+    void Extend(It begin, It end) {
+      MutableCPU();
+      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
+      std::copy(begin, end, out_it);
+    }
+
+    // resize the vector
+    void resize(size_t size) {
+      MutableCPU();
+      cpu_.resize(size);
+    }
+
+    // get cuda ptr. immutable
+    const T *CUDAData(platform::Place place) const {
+      PADDLE_ENFORCE(platform::is_gpu_place(place),
+                     "CUDA Data must on CUDA place");
+      ImmutableCUDA(place);
+      return reinterpret_cast<T *>(gpu_.data_);
+    }
+
+    // get cuda ptr. mutable
+    T *CUDAMutableData(platform::Place place) {
+      const T *ptr = CUDAData(place);
+      flag_ = kDirty | kDataInCUDA;
+      return const_cast<T *>(ptr);
+    }
+
+    // clear
+    void clear() {
+      cpu_.clear();
+      flag_ = kDirty | kDataInCPU;
+    }
+
+    size_t capacity() const { return cpu_.capacity(); }
+
+    // reserve data
+    void reserve(size_t size) const { cpu_.reserve(size); }
+
+    // implicit cast operator. Vector can be cast to std::vector implicitly.
+    operator std::vector<T>() const {
+      ImmutableCPU();
+      return cpu_;
+    }
+
+    bool operator==(const VectorData &other) const {
+      ImmutableCPU();
+      other.ImmutableCPU();
+      return cpu_ == other.cpu_;
+    }
+
+    std::mutex &Mutex() const { return mtx_; }
+
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
+      if (gpu_.data_ == nullptr) {
+        return nullptr;
+      } else {
+        return std::unique_ptr<platform::CUDAPlace>(
+            new platform::CUDAPlace(gpu_.place_));
      }
    }
-  }

-  // Ctor with init_list
-  Vector(std::initializer_list<T> init) {
-    if (init.size() == 0) {
-      InitEmpty();
-    } else {
-      InitByIter(init.size(), init.begin(), init.end());
+   private:
+    enum DataFlag {
+      kDataInCPU = 0x01,
+      kDataInCUDA = 0x02,
+      // kDirty means the data has been changed in one device.
+      kDirty = 0x10
+    };
+
+    void CopyToCPU() const {
+      // COPY GPU Data To CPU
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
+      void *src = gpu_.data_;
+      void *dst = cpu_.data();
+      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
+                   stream);
+      dev_ctx->Wait();
+    }
+
+    void MutableCPU() {
+      if (IsInCUDA() && IsDirty()) {
+        CopyToCPU();
+      }
+      flag_ = kDirty | kDataInCPU;
    }
-  }
+
+    void ImmutableCUDA(platform::Place place) const {
+      if (IsDirty()) {
+        if (IsInCPU()) {
+          CopyCPUDataToCUDA(place);
+          UnsetFlag(kDirty);
+          SetFlag(kDataInCUDA);
+        } else if (IsInCUDA() &&
+                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+          PADDLE_THROW("This situation should not happen");
+          // Still dirty
+        } else {
+          // Dirty && DataInCUDA && Device is same
+          // Do nothing
+        }
+      } else {
+        if (!IsInCUDA()) {
+          // Even data is not dirty. However, data is not in CUDA. Copy data.
+          CopyCPUDataToCUDA(place);
+          SetFlag(kDataInCUDA);
+        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+          PADDLE_THROW("This situation should not happen.");
+        } else {
+          // Not Dirty && DataInCUDA && Device is same
+          // Do nothing.
+        }
+      }
+    }
+
+    void CopyCPUDataToCUDA(const platform::Place &place) const {
+      void *src = cpu_.data();
+      gpu_.Resize(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_.data_;
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
+      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
+                   stream);
+    }
+
+    void ImmutableCPU() const {
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
+                                      // CPU has no data.
+        CopyToCPU();
+        UnsetFlag(kDirty);
+      }
+      SetFlag(kDataInCPU);
+    }
+
+    void UnsetFlag(int flag) const { flag_ &= ~flag; }
+    void SetFlag(int flag) const { flag_ |= flag; }
+
+    bool IsDirty() const { return flag_ & kDirty; }
+
+    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+    bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+    mutable std::vector<T> cpu_;
+    mutable details::CUDABuffer gpu_;
+    mutable int flag_;
+
+    mutable std::mutex mtx_;
+  };
+
+ public:
+  // Default ctor. Create empty Vector
+  Vector() : m_(new VectorData()) {}
+
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T &value = T())
+      : m_(new VectorData(count, value)) {}
+
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}

  // implicit cast from std::vector.
  template <typename U>
-  Vector(const std::vector<U> &dat) {  // NOLINT
-    if (dat.size() == 0) {
-      InitEmpty();
-    } else {
-      InitByIter(dat.size(), dat.begin(), dat.end());
-    }
+  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
  }

  // Copy ctor
-  Vector(const Vector<T> &other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { m_ = other.m_; }

  // Copy operator
  Vector<T> &operator=(const Vector<T> &other) {
-    if (other.size() != 0) {
-      this->InitByIter(other.size(), other.begin(), other.end());
-    } else {
-      InitEmpty();
-    }
+    m_ = other.m_;
    return *this;
  }

  // Move ctor
-  Vector(Vector<T> &&other) {
-    this->size_ = other.size_;
-    this->flag_ = other.flag_;
-    if (other.cuda_vec_.memory_size()) {
-      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
-    }
-    if (other.cpu_vec_.memory_size()) {
-      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
-    }
-  }
+  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }

  // CPU data access method. Mutable.
-  T &operator[](size_t i) {
-    MutableCPU();
-    return const_cast<T *>(cpu_vec_.data<T>())[i];
-  }
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }

  // CPU data access method. Immutable.
-  const T &operator[](size_t i) const {
-    ImmutableCPU();
-    return cpu_vec_.data<T>()[i];
-  }
+  const T &operator[](size_t i) const { return m_.Data()[i]; }

  // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return size_; }
+  size_t size() const { return m_.Data().size(); }

-  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  iterator begin() { return m_.MutableData()->begin(); }

-  T *end() {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  iterator end() { return m_.MutableData()->end(); }

-  T &front() { return *begin(); }
+  T &front() { return m_.MutableData()->front(); }

-  T &back() {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  T &back() { return m_.MutableData()->back(); }

-  const T *begin() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
-  }
+  const_iterator begin() const { return m_.Data().begin(); }

-  const T *end() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  const_iterator end() const { return m_.Data().end(); }

-  const T *cbegin() const { return begin(); }
+  const_iterator cbegin() const { return begin(); }

-  const T *cend() const { return end(); }
+  const_iterator cend() const { return end(); }

-  const T &back() const {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  const T &back() const { return m_.Data().back(); }

-  T *data() { return begin(); }
+  T *data() { return m_.MutableData()->data(); }

-  const T *data() const { return begin(); }
+  const T *data() const { return m_.Data().data(); }

-  const T &front() const { return *begin(); }
+  const T &front() const { return m_.Data().front(); }
  // end of std::vector iterator methods

  // assign this from iterator.
  // NOTE: the iterator must support `end-begin`
  template <typename Iter>
  void assign(Iter begin, Iter end) {
-    InitByIter(end - begin, begin, end);
+    m_.MutableData()->assign(begin, end);
  }

  // push_back. If the previous capacity is not enough, the memory will
  // double.
-  void push_back(T elem) {
-    if (size_ + 1 > capacity()) {
-      reserve((size_ + 1) << 1);
-    }
-    *end() = elem;
-    ++size_;
-  }
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }

  // extend a vector by iterator.
  // NOTE: the iterator must support end-begin
  template <typename It>
  void Extend(It begin, It end) {
-    size_t pre_size = size_;
-    resize(pre_size + (end - begin));
-    T *ptr = this->begin() + pre_size;
-    for (; begin < end; ++begin, ++ptr) {
-      *ptr = *begin;
-    }
+    m_.MutableData()->Extend(begin, end);
  }

  // resize the vector
  void resize(size_t size) {
-    if (size + 1 <= capacity()) {
-      size_ = size;
-    } else {
-      MutableCPU();
-      Tensor cpu_tensor;
-      platform::Place cpu = platform::CPUPlace();
-      T *ptr = cpu_tensor.mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T *old_ptr =
-          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
-      if (old_ptr != nullptr) {
-        std::copy(old_ptr, old_ptr + size_, ptr);
-      }
-      size_ = size;
-      cpu_vec_.ShareDataWith(cpu_tensor);
+    if (m_.Data().size() != size) {
+      m_.MutableData()->resize(size);
    }
  }

  // get cuda ptr. immutable
  const T *CUDAData(platform::Place place) const {
-    PADDLE_ENFORCE(platform::is_gpu_place(place),
-                   "CUDA Data must on CUDA place");
-    ImmutableCUDA(place);
-    return cuda_vec_.data<T>();
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place. Detach manually.
+    m_.Detach();
+    return CUDAData(place);
  }

  // get cuda ptr. mutable
  T *CUDAMutableData(platform::Place place) {
-    const T *ptr = CUDAData(place);
-    flag_ = kDirty | kDataInCUDA;
-    return const_cast<T *>(ptr);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place. Detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
  }

  // clear
-  void clear() {
-    size_ = 0;
-    flag_ = kDirty | kDataInCPU;
-  }
+  void clear() { m_.MutableData()->clear(); }

-  size_t capacity() const {
-    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
-  }
+  size_t capacity() const { return m_.Data().capacity(); }

  // reserve data
-  void reserve(size_t size) {
-    size_t pre_size = size_;
-    resize(size);
-    resize(pre_size);
-  }
+  void reserve(size_t size) { m_.Data().reserve(size); }

  // the unify method to access CPU or CUDA data. immutable.
  const T *Data(platform::Place place) const {
@@ -248,12 +481,7 @@ class Vector {
  }

  // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const {
-    std::vector<T> result;
-    result.resize(size());
-    std::copy(begin(), end(), result.begin());
-    return result;
-  }
+  operator std::vector<T>() const { return m_.Data(); }

  bool operator==(const Vector<T> &other) const {
    if (size() != other.size()) return false;
@@ -267,118 +495,11 @@ class Vector {
    return true;
  }

- private:
-  void InitEmpty() {
-    size_ = 0;
-    flag_ = kDataInCPU;
-  }
-
-  template <typename Iter>
-  void InitByIter(size_t size, Iter begin, Iter end) {
-    platform::Place cpu = platform::CPUPlace();
-    T *ptr = this->cpu_vec_.template mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-    for (size_t i = 0; i < size; ++i) {
-      *ptr++ = *begin++;
-    }
-    flag_ = kDataInCPU | kDirty;
-    size_ = size;
-  }
-
-  enum DataFlag {
-    kDataInCPU = 0x01,
-    kDataInCUDA = 0x02,
-    // kDirty means the data has been changed in one device.
-    kDirty = 0x10
-  };
-
-  void CopyToCPU() const {
-    // COPY GPU Data To CPU
-    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
-    WaitPlace(cuda_vec_.place());
-  }
-
-  void MutableCPU() {
-    if (IsInCUDA() && IsDirty()) {
-      CopyToCPU();
-    }
-    flag_ = kDirty | kDataInCPU;
-  }
-
-  void ImmutableCUDA(platform::Place place) const {
-    if (IsDirty()) {
-      if (IsInCPU()) {
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        UnsetFlag(kDirty);
-        SetFlag(kDataInCUDA);
-      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        cuda_vec_.ShareDataWith(tmp);
-        // Still dirty
-      } else {
-        // Dirty && DataInCUDA && Device is same
-        // Do nothing
-      }
-    } else {
-      if (!IsInCUDA()) {
-        // Even data is not dirty. However, data is not in CUDA. Copy data.
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        SetFlag(kDataInCUDA);
-      } else if (!(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        WaitPlace(cuda_vec_.place());
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        WaitPlace(place);
-        cuda_vec_.ShareDataWith(tmp);
-      } else {
-        // Not Dirty && DataInCUDA && Device is same
-        // Do nothing.
-      }
-    }
-  }
-
-  void ImmutableCPU() const {
-    if (IsDirty() &&
-        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
-      CopyToCPU();
-      UnsetFlag(kDirty);
-    }
-    SetFlag(kDataInCPU);
-  }
-
-  void UnsetFlag(int flag) const { flag_ &= ~flag; }
-  void SetFlag(int flag) const { flag_ |= flag; }
+  const void *Handle() const { return &m_.Data(); }

-  bool IsDirty() const { return flag_ & kDirty; }
-
-  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
-
-  bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-  static void WaitPlace(const platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      platform::DeviceContextPool::Instance()
-          .Get(boost::get<platform::CUDAPlace>(place))
-          ->Wait();
-    }
-  }
-
-  static T &EmptyDummy() {
-    static T dummy = T();
-    return dummy;
-  }
-
-  mutable int flag_;
-  mutable Tensor cpu_vec_;
-  mutable Tensor cuda_vec_;
-  size_t size_;
+ private:
+  // Vector is an COW object.
+  mutable details::COWPtr<VectorData> m_;
 };

 #else  // PADDLE_WITH_CUDA

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+
+// These code can be shared with Executor.
+static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
+  if (var_type == proto::VarType::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == proto::VarType::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::STEP_SCOPES) {
+    var->GetMutable<std::vector<framework::Scope>>();
+  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
+  } else if (var_type == proto::VarType::PLACE_LIST) {
+    var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarType::READER) {
+    var->GetMutable<ReaderHolder>();
+  } else if (var_type == proto::VarType::CHANNEL) {
+    var->GetMutable<ChannelHolder>();
+  } else if (var_type == proto::VarType::RAW) {
+    // GetMutable will be called in operator
+  } else {
+    PADDLE_THROW(
+        "Variable type %d is not in "
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        var_type);
+  }
+}
+
+void NaiveExecutor::Prepare(Scope *parent_scope,
+                            const ProgramDesc &program_desc, int block_id,
+                            bool with_feed_fetch_ops) {
+  if (!parent_scope) {
+    scope_ = new framework::Scope;
+  } else {
+    scope_ = &parent_scope->NewScope();
+  }
+  CreateVariables(program_desc, scope_, block_id);
+  CreateOps(program_desc, block_id, with_feed_fetch_ops);
+}
+
+void NaiveExecutor::Run() {
+  for (auto &op : ops_) {
+    VLOG(4) << "run " << op->Type();
+    op->Run(*scope_, place_);
+  }
+}
+
+void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
+                                    int block_id) {
+  PADDLE_ENFORCE(scope);
+  auto &global_block = desc.Block(block_id);
+
+  const Scope *ancestor_scope = scope;
+  while (ancestor_scope->parent()) {
+    ancestor_scope = ancestor_scope->parent();
+  }
+
+  if (ancestor_scope != scope) {
+    for (auto &var : global_block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+      // Create persistable vars in ancestor scope.
+      if (var->Persistable()) {
+        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " global, which pointer is " << ptr;
+      } else {  // Create temporary variables in local scope.
+        auto *ptr = scope->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " locally, which pointer is " << ptr;
+      }
+    }
+  } else {
+    for (auto &var : global_block.AllVars()) {
+      auto *ptr = scope->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+              << ptr;
+    }
+  }
+}
+
+void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
+                              bool with_feed_fetch_ops) {
+  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
+    if (!with_feed_fetch_ops &&
+        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
+      string::PrettyLogEndl(string::Style::detail(), "---  skip [%s], %s -> %s",
+                            op_desc->Input("X")[0], op_desc->Type(),
+                            op_desc->Output("Out")[0]);
+      continue;
+    }
+    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
+  }
+}
+
+LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
+  PADDLE_ENFORCE(scope_, "Need to init scope first");
+  auto *var = scope_->FindVar(name);
+  PADDLE_ENFORCE(var, "No variable [%s] in the scope");
+  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
+  return tensor;
+}
+
+void NaiveExecutor::CleanFeedFetchOps() {
+  std::vector<std::unique_ptr<OperatorBase>> ops;
+  for (auto &op : ops_) {
+    if (op->Type() != "feed" && op->Type() != "fetch") {
+      ops.emplace_back(std::move(op));
+    }
+  }
+  ops_.swap(ops);
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+ * Simple, intuitive and effective. Only single thread is supported, and
+ * currently designed for inference.
+ */
+class NaiveExecutor {
+ public:
+  explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
+
+  // Create child scope.
+  // Create variables.
+  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
+  void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
+               int block_id, bool with_feed_fetch_ops);
+
+  // Run all the operators.
+  void Run();
+
+  // Get an tensor to operating directly, without the need for feed_ops.
+  LoDTensor* FindTensor(const std::string& name);
+
+  Scope* scope() { return scope_; }
+
+  void CleanFeedFetchOps();
+
+ protected:
+  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
+
+  void CreateOps(const ProgramDesc& desc, int block_id,
+                 bool with_feed_fetch_ops);
+
+ private:
+  const platform::Place place_;
+  // Catch the required resource to avoid recreate.
+  std::vector<std::unique_ptr<OperatorBase>> ops_;
+  Scope* scope_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/naive_executor.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+TEST(NaiveExecutor, Basic) {
+  ProgramDesc program;
+  auto* main_block = program.MutableBlock(0);
+  auto* a = main_block->Var("a");  // input
+  auto* b = main_block->Var("b");  // input
+  auto* c = main_block->Var("c");  // input
+  a->SetType(proto::VarType::LOD_TENSOR);
+  b->SetType(proto::VarType::LOD_TENSOR);
+  c->SetType(proto::VarType::LOD_TENSOR);
+
+  auto* add = main_block->AppendOp();
+  add->SetType("elementwise_add");
+  add->SetInput("X", {"a"});
+  add->SetInput("Y", {"b"});
+  add->SetOutput("Out", {"c"});
+
+  auto place = platform::CPUPlace();
+  NaiveExecutor exe(place);
+  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
+  auto* a_tensor = exe.FindTensor("a");
+  auto* b_tensor = exe.FindTensor("b");
+  auto* c_tensor = exe.FindTensor("c");
+
+  a_tensor->Resize({1, 4});
+  b_tensor->Resize({1, 4});
+  c_tensor->Resize({1, 4});
+  b_tensor->mutable_data<float>(place);
+  a_tensor->mutable_data<float>(place);
+
+  float a_arr[] = {0, 1, 2, 3};
+  float b_arr[] = {0.0, .1, .2, .3};
+
+  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
+  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
+
+  exe.Run();
+
+  auto* c_data = c_tensor->mutable_data<float>(place);
+  for (int i = 0; i < 4; i++) {
+    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
+
+USE_OP(elementwise_add);
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
      platform::SetDeviceId(dev_id);
 #endif
    }
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
+
+    if (platform::IsProfileEnabled()) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::RecordEvent record_event(Type(), pool.Get(place));
+    }
+
    RunImpl(scope, place);
+
    if (VLOG_IS_ON(3)) {
      VLOG(3) << place << " " << DebugStringEx(&scope);
    }

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-
 #include <string>
 #include <tuple>
 #include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"

 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"

 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -35,80 +33,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes, const bool use_cuda,
-#ifdef PADDLE_WITH_CUDA
-    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
-#else
-    const BuildStrategy &strategy) {
-#endif
-  // Convert the program to graph.
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-
-  // Apply a graph viz pass to record a graph.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
-
-  // Apply op fusion.
-  if (strategy.fuse_elewise_add_act_ops_) {
-    auto fuse_elewise_add_act_pass =
-        ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
-    graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
-    // Apply a graph viz pass to record a graph.
-    if (!strategy.debug_graphviz_path_.empty()) {
-      auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-      const std::string graph_path = string::Sprintf(
-          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
-      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-      graph = viz_pass->Apply(std::move(graph));
-    }
-  }
-
-  // Convert graph to run on multi-devices.
-  auto multi_devices_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_pass");
-  multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
-                                                                      &places);
-  multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
-                                                     &loss_var_name);
-  multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
-      "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
-                                                              &local_scopes);
-  multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
-
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-  multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
-#endif
-  graph = multi_devices_pass->Apply(std::move(graph));
-
-  // Apply a graph print pass to record a graph with device info.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto multi_devices_print_pass =
-        ir::PassRegistry::Instance().Get("multi_devices_print_pass");
-    multi_devices_print_pass->SetNotOwned<const std::string>(
-        "debug_graphviz_path", &strategy.debug_graphviz_path_);
-    multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
-        "graph_printer", new details::GraphvizSSAGraphPrinter);
-    graph = multi_devices_print_pass->Apply(std::move(graph));
-  }
-
-  // Verify that the graph is correct for multi-device executor.
-  auto multi_devices_check_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_check_pass");
-  graph = multi_devices_check_pass->Apply(std::move(graph));
-  return graph;
-}
-
 class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
 // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
 // ncclOp
 #ifdef PADDLE_WITH_CUDA
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy,
-      member_->nccl_ctxs_.get());
+      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());

  auto max_memory_size = GetEagerDeletionThreshold();
  if (max_memory_size >= 0) {
@@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor(
    }
  }
 #else
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
-      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy);
+  std::unique_ptr<ir::Graph> graph =
+      build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                           params, member_->local_scopes_, member_->use_cuda_);
 #endif

+  // If the loss_var_name is given, the number of graph should be only one.
+  if (loss_var_name.size()) {
+    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
+                      "The number of graph should be only one");
+  }
+
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
        exec_strategy, member_->local_scopes_, places, std::move(graph)));
@@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() {

 }  // namespace framework
 }  // namespace paddle
-
-USE_PASS(fuse_elewise_add_act_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(multi_devices_pass);
-USE_PASS(multi_devices_check_pass);
-USE_PASS(multi_devices_print_pass);
 #ifdef PADDLE_WITH_CUDA
 USE_PASS(reference_count_pass);
 #endif
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,14 +14,14 @@ limitations under the License. */

 #pragma once

-#include <paddle/fluid/framework/details/build_strategy.h>
 #include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
+#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -20,6 +20,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"

+// The mutex is not needed by training and inference, only for distribution.
+#if PADDLE_WITH_DISTRIBUTE
+#define WITH_LOCK 1
+#else
+#define WITH_LOCK 0
+#endif
+
 DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
            "and add some memory usage logs."
@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }

 Variable* Scope::Var(const std::string& name) {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
 }

 Variable* Scope::FindVar(const std::string& name) const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  return FindVarInternal(name);
 }

 const Scope* Scope::FindScope(const Variable* var) const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  return FindScopeInternal(var);
 }

 void Scope::DropKids() {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }

 bool Scope::HasKid(const Scope* scope) const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
 }

 std::vector<std::string> Scope::LocalVarNames() const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }

 void Scope::DeleteScope(Scope* scope) const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
 }

 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {

 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  RenameInternal(origin_name, new_name);
 }

 std::string Scope::Rename(const std::string& origin_name) const {
+#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
+#endif
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
    selected_rows_.reset(new SelectedRows(rows, height));

    Tensor* value = selected_rows_->mutable_value();
-    value->mutable_data<float>(
+    auto* data = value->mutable_data<float>(
        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+    for (int64_t i = 0; i < value->numel(); ++i) {
+      data[i] = static_cast<float>(i);
+    }
  }

 protected:
@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+  auto* dst_data = dst_tensor.value().data<float>();
+  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
+    ASSERT_EQ(dst_data[i], static_cast<float>(i));
+  }
 }

 TEST(SelectedRows, SparseTable) {

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -53,7 +53,7 @@ if(NOT APPLE)
 endif()

 if(WITH_TESTING)
-  # tests/book depends the models that generated by python/paddle/fluid/tests/book
+    # tests/book depends the models that generated by python/paddle/fluid/tests/book
  add_subdirectory(tests/book)
  if(WITH_INFERENCE_API_TEST)
    add_subdirectory(tests/api)

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
 cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
 set(analysis_deps
-    framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
+        framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)

 cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
  analyzer.cc

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
 TEST(Analyzer, analysis_with_tensorrt) {
  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
  analyser.Run(&argument);
 }

-void TestWord2vecPrediction(const std::string &model_path) {
+void TestWord2vecPrediction(const std::string& model_path) {
  NativeConfig config;
  config.model_dir = model_path;
  config.use_gpu = false;
@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
  // The outputs' buffers are in CPU memory.
  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
    LOG(INFO) << "data: "
-              << static_cast<float *>(outputs.front().data.data())[i];
-    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
+              << static_cast<float*>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
                   result[i]);
  }
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
  }
 }

-void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+void CreateTrtEngineOp(Node *node, Argument *argument,
                       framework::proto::BlockDesc *block) {
+  PADDLE_ENFORCE(argument->main_dfg.get());
+  const DataFlowGraph &graph = *(argument->main_dfg);
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,

  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
  // Set attrs
+
  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
+  SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
+  SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  *block_desc.Proto()->mutable_vars() =
      argument_->origin_program_desc->blocks(0).vars();
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
+  CreateTrtEngineOp(node, argument_, block_desc.Proto());
  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
 void SubGraphFuse::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
+    if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
+      continue;
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph

--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -20,6 +20,7 @@ limitations under the License. */

 #include <vector>

+#include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/node.h"

@@ -63,8 +64,11 @@ class SubGraphFuse {
 public:
  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;

-  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
-      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
+               Argument *argument)
+      : graph_(graph),
+        node_inside_subgraph_teller_(teller),
+        argument_(argument) {}

  // The main method which run all the logic.
  void operator()();
@@ -76,6 +80,7 @@ class SubGraphFuse {
 private:
  DataFlowGraph *graph_;
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument *argument_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
 TEST(SubGraphSplitter, Fuse) {
  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
  auto dfg = ProgramDescToDFG(desc);
+  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(3));

  size_t count0 = dfg.nodes.size();

-  SubGraphFuse fuse(&dfg, teller);
+  SubGraphFuse fuse(&dfg, teller, &argument);
  fuse();

  int count1 = 0;

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
    : node_inside_subgraph_teller_(teller) {}

 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
-  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
  VLOG(4) << "debug info "
          << graph->HumanReadableInfo(false /*show_values*/,
                                      true /*show_functions*/);

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {

  explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);

-  bool Initialize(Argument* argument) override { return true; }
+  bool Initialize(Argument* argument) override {
+    argument_ = argument;
+    return true;
+  }

  // This class get a sub-graph as input and determine whether to transform this
  // sub-graph into TensorRT.
@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {

 private:
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument* argument_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
  };

  Argument argument(FLAGS_inference_model_dir);
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));

  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,10 +18,10 @@ if(APPLE)
 endif(APPLE)


-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})

 if(WITH_GPU AND TENSORRT_FOUND)
-    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
+    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
 endif()

 function(inference_api_test TARGET_NAME)
@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
    endif(WITH_TESTING)
 endfunction(inference_api_test)

-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
+cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
+cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
 cc_test(test_paddle_inference_api
        SRCS api_tester.cc
        DEPS paddle_inference_api)
@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_api_impl SRC api_impl_tester.cc
                    ARGS test_word2vec test_image_classification)

+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book)
+
 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
        SRCS api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)

 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()

 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
-    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
-    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
+    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
+    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
    function(anakin_target target_name)
      target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
    endfunction()

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -16,11 +16,15 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/api/timer.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -28,8 +32,11 @@ DECLARE_bool(profile);

 namespace paddle {

+using contrib::AnalysisConfig;
+
 bool AnalysisPredictor::Init(
-    const std::shared_ptr<framework::Scope>& parent_scope) {
+    const std::shared_ptr<framework::Scope> &parent_scope,
+    const std::shared_ptr<framework::ProgramDesc> &program) {
  VLOG(3) << "Predictor::init()";
 #if !defined(_WIN32)
  if (FLAGS_profile) {
@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(

  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
-    LOG(WARNING) << "ir optimize only supports CPU currently";
+    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
+                    "is turned false.";
    config_.enable_ir_optim = false;
  } else {
    place_ = paddle::platform::CPUPlace();
@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
    scope_.reset(new paddle::framework::Scope());
  }

-  executor_.reset(new paddle::framework::Executor(place_));
+  executor_.reset(new paddle::framework::NaiveExecutor(place_));

-  // Initialize the inference program
-  if (!config_.model_dir.empty()) {
-    // Parameters are saved in separate files sited in
-    // the specified `dirname`.
-    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
-                                                 config_.model_dir);
-  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
-    // All parameters are saved in a single file.
-    // The file names should be consistent with that used
-    // in Python API `fluid.io.save_inference_model`.
-    inference_program_ = paddle::inference::Load(
-        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+  if (!program) {
+    if (!LoadProgramDesc()) return false;
+    OptimizeInferenceProgram();
  } else {
-    LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
+    inference_program_ = program;
+  }
+  executor_->Prepare(scope_.get(), *inference_program_, 0,
+                     config_.use_feed_fetch_ops);
+
+  // Get the feed_target_names and fetch_target_names
+  PrepareFeedFetch();
+  return true;
+}
+
+bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
+                            std::vector<PaddleTensor> *output_data,
+                            int batch_size) {
+  VLOG(3) << "Predictor::predict";
+  inference::Timer timer;
+  timer.tic();
+  // set feed variable
+  std::vector<framework::LoDTensor> feeds;
+  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
+  if (!SetFeed(inputs, scope)) {
+    LOG(ERROR) << "fail to set feed";
    return false;
  }
+  // Run the inference program
+  // if share variables, we need not create variables
+  executor_->Run();

-  OptimizeInferenceProgram();
-  if (config_._use_mkldnn) {
-    executor_->EnableMKLDNN(*inference_program_);
+  // get fetch variable
+  if (!GetFetch(output_data, scope)) {
+    LOG(ERROR) << "fail to get fetches";
+    return false;
  }
-  ctx_ = executor_->Prepare(*inference_program_, 0);
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  return true;
+}

-  VLOG(5) << "to create variables";
-  PADDLE_ENFORCE(scope_.get());
-  executor_->CreateVariables(*inference_program_,
-                             sub_scope_ ? sub_scope_ : scope_.get(), 0);
-  // Get the feed_target_names and fetch_target_names
-  PrepareFeedFetch();
+bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+                                framework::Scope *scope) {
+  VLOG(3) << "Predictor::set_feed";
+  if (inputs.size() != feeds_.size()) {
+    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
+               << inputs.size();
+    return false;
+  }
+
+  // Cache the inputs memory for better concurrency performance.
+  feed_tensors_.resize(inputs.size());
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto &input = feed_tensors_[i];
+    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
+    void *input_ptr;
+    if (inputs[i].dtype == PaddleDType::INT64) {
+      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
+      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+    } else {
+      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
+      return false;
+    }
+
+    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+    int idx = -1;
+    if (config_.specify_input_name) {
+      idx = feed_names_[inputs[i].name];
+    } else {
+      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
+    }
+    framework::SetFeedVariable(scope, input, "feed", idx);
+  }
+  return true;
+}
+
+template <typename T>
+void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
+                                    PaddleTensor *output) {
+  // set shape.
+  auto shape = framework::vectorize(fetch.dims());
+  output->shape.assign(shape.begin(), shape.end());
+  // set data.
+  const T *data = fetch.data<T>();
+  int num_elems = inference::VecReduceToInt(shape);
+  output->data.Resize(num_elems * sizeof(T));
+  // The fetched tensor output by fetch op, should always in CPU memory, so just
+  // copy.
+  memcpy(output->data.data(), data, num_elems * sizeof(T));
+  // set lod
+  output->lod.clear();
+  for (auto &level : fetch.lod()) {
+    output->lod.emplace_back(level.begin(), level.end());
+  }
+}
+
+bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
+                                 framework::Scope *scope) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetchs_.size());
+  for (size_t i = 0; i < fetchs_.size(); ++i) {
+    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+    PADDLE_ENFORCE((size_t)idx == i);
+    framework::LoDTensor &fetch =
+        framework::GetFetchVariable(*scope, "fetch", idx);
+    auto type = fetch.type();
+    auto output = &(outputs->at(i));
+    if (type == typeid(float)) {
+      GetFetchOne<float>(fetch, output);
+      output->dtype = PaddleDType::FLOAT32;
+    } else if (type == typeid(int64_t)) {
+      GetFetchOne<int64_t>(fetch, output);
+      output->dtype = PaddleDType::INT64;
+    } else {
+      LOG(ERROR) << "unknown type, only support float32 and int64 now.";
+    }
+  }
  return true;
 }

@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
        new std::string(config_.prog_file));
    argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
  }
+
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
  PADDLE_ENFORCE(
@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 }

 template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
-    const contrib::AnalysisConfig& config) {
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
+    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
  VLOG(3) << "create AnalysisConfig";
  if (config.use_gpu) {
    // 1. GPU memeroy
@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
  }

  std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  if (!dynamic_cast<AnalysisPredictor*>(predictor.get())->Init(nullptr)) {
+  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return predictor;
 }

+void AnalysisPredictor::PrepareFeedFetch() {
+  for (auto *op : inference_program_->Block(0).AllOps()) {
+    if (op->Type() == "feed") {
+      int idx = boost::get<int>(op->GetAttr("col"));
+      if (feeds_.size() <= static_cast<size_t>(idx)) {
+        feeds_.resize(idx + 1);
+      }
+      feeds_[idx] = op;
+      feed_names_[op->Output("Out")[0]] = idx;
+    } else if (op->Type() == "fetch") {
+      int idx = boost::get<int>(op->GetAttr("col"));
+      if (fetchs_.size() <= static_cast<size_t>(idx)) {
+        fetchs_.resize(idx + 1);
+      }
+      fetchs_[idx] = op;
+    }
+  }
+}
+
+std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
+  res->input_or_output_ = true;
+  res->SetName(name);
+  return res;
+}
+
+std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
+  res->input_or_output_ = false;
+  res->SetName(name);
+  return res;
+}
+
+bool AnalysisPredictor::ZeroCopyRun() {
+  executor_->Run();
+  return true;
+}
+
+bool AnalysisPredictor::LoadProgramDesc() {
+  // Initialize the inference program
+  std::unique_ptr<framework::Executor> tmp_exe(
+      new framework::Executor(platform::CPUPlace()));
+  if (!config_.model_dir.empty()) {
+    // Parameters are saved in separate files sited in
+    // the specified `dirname`.
+    inference_program_ = paddle::inference::Load(
+        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
+        config_.model_dir);
+  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+    // All parameters are saved in a single file.
+    // The file names should be consistent with that used
+    // in Python API `fluid.io.save_inference_model`.
+    inference_program_ = paddle::inference::Load(
+        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
+        config_.prog_file, config_.param_file);
+  } else {
+    LOG(ERROR) << string::Sprintf(
+        "not valid model path '%s' or program path '%s'.", config_.model_dir,
+        config_.param_file);
+    return false;
+  }
+  return true;
+}
+std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
+  auto *x = new AnalysisPredictor(config_);
+  x->Init(scope_, inference_program_);
+  return std::unique_ptr<PaddlePredictor>(x);
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
-    const contrib::AnalysisConfig& config) {
+    const contrib::AnalysisConfig &config) {
  return CreatePaddlePredictor<contrib::AnalysisConfig,
                               PaddleEngineKind::kAnalysis>(config);
 }

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -12,42 +12,81 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#pragma once
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {

 using inference::analysis::Argument;
 using inference::analysis::Analyzer;
 using framework::proto::ProgramDesc;
+using framework::NaiveExecutor;
+using contrib::AnalysisConfig;

 /* This predictor is based on the original native predictor with IR and Analysis
 * support. It will optimize IR and Parameters in the runtime.
 * TODO(Superjomn) Replace the Navive predictor?
 */
-class AnalysisPredictor : public NativePaddlePredictor {
+class AnalysisPredictor : public PaddlePredictor {
 public:
-  explicit AnalysisPredictor(const contrib::AnalysisConfig& config)
-      : NativePaddlePredictor(config), config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}

-  bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
+  bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
+            const std::shared_ptr<framework::ProgramDesc> &program = nullptr);

-  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data,
-           int batch_size = -1) override {
-    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
-  }
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
+
+  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string &name) override;
+  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string &name) override;
+
+  bool ZeroCopyRun() override;
+
+  void PrepareFeedFetch();

  void OptimizeInferenceProgram();

-  Argument& analysis_argument() { return argument_; }
+  Argument &analysis_argument() { return argument_; }
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  framework::Scope *scope() { return executor_->scope(); }
+  framework::ProgramDesc &program() { return *inference_program_; }
+
+ protected:
+  bool LoadProgramDesc();
+
+  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
+               framework::Scope *scope);
+  bool GetFetch(std::vector<PaddleTensor> *output_data,
+                framework::Scope *scope);
+  template <typename T>
+  void GetFetchOne(const framework::LoDTensor &fetchs,
+                   PaddleTensor *output_data);

 private:
  contrib::AnalysisConfig config_;
  Argument argument_;
+  std::unique_ptr<NaiveExecutor> executor_;
+  platform::Place place_;
+  std::shared_ptr<framework::Scope> scope_;
+  framework::Scope *sub_scope_{nullptr};
+  std::shared_ptr<framework::ProgramDesc> inference_program_;
+  std::vector<framework::OpDesc *> feeds_;
+  std::map<std::string, size_t> feed_names_;
+  std::vector<framework::OpDesc *> fetchs_;
+  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
+  // concurrency problems, so cache them.
+  std::vector<framework::LoDTensor> feed_tensors_;
 };

 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+DEFINE_string(dirname, "", "dirname to tests.");
+
+namespace paddle {
+namespace inference {
+using contrib::AnalysisConfig;
+
+TEST(AnalysisPredictor, ZeroCopy) {
+  AnalysisConfig config;
+  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.use_feed_fetch_ops = false;
+
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  auto w0 = predictor->GetInputTensor("firstw");
+  auto w1 = predictor->GetInputTensor("secondw");
+  auto w2 = predictor->GetInputTensor("thirdw");
+  auto w3 = predictor->GetInputTensor("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->ZeroCopyRun();
+
+  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
+  PaddlePlace place;
+  int size = 0;
+  auto* out_data = out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  LOG(INFO) << "output_data: " << out_data;
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle_inference_api.h"

 namespace paddle {

@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
  }
 }

-PaddleBuf::PaddleBuf(PaddleBuf&& other)
+PaddleBuf::PaddleBuf(PaddleBuf &&other)
    : data_(other.data_),
      length_(other.length_),
      memory_owned_(other.memory_owned_) {
@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
  other.length_ = 0;
 }

-PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }

-PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
  if (!other.memory_owned_) {
    data_ = other.data_;
    length_ = other.length_;
@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
  return *this;
 }

-PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
+PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
  // only the buffer with external memory can be copied
  data_ = other.data_;
  length_ = other.length_;
@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
  }
 }

-void PaddleBuf::Reset(void* data, size_t length) {
+void PaddleBuf::Reset(void *data, size_t length) {
  Free();
  memory_owned_ = false;
  data_ = data;
@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
  if (memory_owned_ && data_) {
    PADDLE_ENFORCE_GT(length_, 0);
-    free(static_cast<char*>(data_));
+    free(static_cast<char *>(data_));
    data_ = nullptr;
    length_ = 0;
  }

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
  VLOG(4) << "Run prepared context";
  executor_->RunPreparedContext(ctx_.get(), scope,
                                false, /* don't create local scope each time*/
-                                false /* don't create variable eatch time */);
+                                false /* don't create variable each time */);
  VLOG(4) << "Finish prepared context";
  // get fetch variable
  if (!GetFetch(output_data, scope)) {

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #pragma once

@@ -30,6 +30,8 @@

 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {

  ~NativePaddlePredictor() override;

+  framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
+
 protected:
  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
               framework::Scope *scope);

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {

 NativeConfig GetConfig() {
  NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  LOG(INFO) << "dirname  " << config.model_dir;
  config.fraction_of_gpu_memory = 0.15;
 #ifdef PADDLE_WITH_CUDA
@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
-      FLAGS_dirname + "image_classification_resnet.inference.model";
+      FLAGS_dirname + "/image_classification_resnet.inference.model";

  const bool is_combined = false;
  std::vector<std::vector<int64_t>> feed_target_shapes =
@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
-      FLAGS_dirname + "image_classification_resnet.inference.model";
+      FLAGS_dirname + "/image_classification_resnet.inference.model";

  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
  std::vector<framework::LoDTensor> jobs(num_jobs);

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
    VLOG(3) << "Predictor::init()";
-    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
-    FLAGS_tensorrt_workspace_size = config_.workspace_size;
    if (config_.use_gpu) {
      place_ = paddle::platform::CUDAPlace(config_.device);
    } else {
@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  void OptimizeInferenceProgram() {
    // Analyze inference_program
    Argument argument;
+
+    argument.Set<int>("minimum_subgraph_size",
+                      new int(config_.minimum_subgraph_size));
+    argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
+    argument.Set<int>("workspace_size", new int(config_.workspace_size));
+    argument.Set<std::string>("precision_mode",
+                              new std::string(config_.precision_mode));
+
    if (!config_.model_dir.empty()) {
      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
    } else {

--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+
+void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
+  PADDLE_ENFORCE(!name_.empty(),
+                 "Need to SetName first, so that the corresponding tensor can "
+                 "be retrieved.");
+  PADDLE_ENFORCE(input_or_output_,
+                 "Can't reshape the output tensor, it is readonly");
+  PADDLE_ENFORCE(scope_);
+  auto *scope = static_cast<framework::Scope *>(scope_);
+  auto *var = scope->FindVar(name_);
+  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize(framework::make_ddim(shape));
+}
+
+template <typename T>
+T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  switch (static_cast<int>(place)) {
+    case static_cast<int>(PaddlePlace::kCPU): {
+      return tensor->mutable_data<T>(platform::CPUPlace());
+    }
+    case static_cast<int>(PaddlePlace::kGPU): {
+      return tensor->mutable_data<T>(platform::CUDAPlace());
+    }
+    default:
+      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
+      break;
+  }
+  return nullptr;
+}
+
+template <typename T>
+T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  auto *res = tensor->data<T>();
+
+  if (platform::is_cpu_place(tensor->place())) {
+    *place = PaddlePlace::kCPU;
+  } else if (platform::is_gpu_place(tensor->place())) {
+    *place = PaddlePlace::kGPU;
+  } else {
+    *place = PaddlePlace::kUNK;
+  }
+
+  *size = tensor->numel();
+  return res;
+}
+
+template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
+template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
+template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
+template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
+
+void *ZeroCopyTensor::FindTensor() const {
+  PADDLE_ENFORCE(!name_.empty(),
+                 "Need to SetName first, so that the corresponding tensor can "
+                 "be retrieved.");
+  PADDLE_ENFORCE(scope_);
+  auto *scope = static_cast<framework::Scope *>(scope_);
+  auto *var = scope->FindVar(name_);
+  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  return tensor;
+}
+
+std::vector<int64_t> ZeroCopyTensor::shape() {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
+  return framework::vectorize(tensor->dims());
+}
+
+void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  framework::LoD lod;
+  for (auto &level : x) {
+    lod.emplace_back(level);
+  }
+  tensor->set_lod(lod);
+}
+
+std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
+  std::vector<std::vector<size_t>> res;
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  for (auto &level : tensor->lod()) {
+    res.emplace_back(level);
+  }
+  return res;
+}
+
+}  // namespace paddle
--- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+
+void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}
+
+template <typename T>
+T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
+  return nullptr;
+}
+
+template <typename T>
+T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
+  return nullptr;
+}
+
+template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
+template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
+template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
+template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
+
+void *ZeroCopyTensor::FindTensor() const { return nullptr; }
+
+std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }
+
+void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
+
+std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
+  return std::vector<std::vector<size_t>>();
+}
+
+}  // namespace paddle
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -21,8 +21,10 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/timer.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace inference {
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
  }
 }

+template <typename T>
+static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                    const std::vector<std::vector<T>> &data) {
+  int size{0};
+  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
+  int c = 0;
+  for (const auto &f : data) {
+    for (T v : f) {
+      ptr[c++] = v;
+    }
+  }
+  return size;
+}
+
 static std::string DescribeTensor(const PaddleTensor &tensor) {
  std::stringstream os;
  os << "Tensor [" << tensor.name << "]\n";
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
  }
 }

+template <typename T>
+std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
+  std::stringstream ss;
+  ss << "\n---- tensor ---" << '\n';
+  ss << "lod: [";
+  for (const auto &level : tensor.lod()) {
+    ss << "[ ";
+    for (auto i : level) {
+      ss << i << ", ";
+    }
+    ss << "]";
+  }
+  ss << "]\n";
+
+  ss << "shape: [";
+  int size = 1;
+  for (int i = 0; i < tensor.dims().size(); i++) {
+    int dim = tensor.dims()[i];
+    ss << dim << ", ";
+    size *= dim;
+  }
+  ss << "]\n";
+
+  ss << "data: ";
+  for (int i = 0; i < std::min(20, size); i++) {
+    ss << tensor.data<T>()[i] << " ";
+  }
+  ss << "\n";
+
+  return ss.str();
+}
+
+static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
+  if (a.size() != b.size()) {
+    LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
+                                  b.size());
+    return false;
+  }
+  for (size_t i = 0; i < a.size(); i++) {
+    auto &al = a[i];
+    auto &bl = b[i];
+    if (al.size() != bl.size()) {
+      LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
+                                    bl.size());
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool CompareShape(const std::vector<int64_t> &a,
+                         const std::vector<int64_t> &b) {
+  if (a.size() != b.size()) {
+    LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
+                                  b.size());
+    return false;
+  }
+  for (size_t i = 0; i < a.size(); i++) {
+    if (a[i] != b[i]) {
+      LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
+                                    a[i], b[i]);
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool CompareTensorData(const framework::LoDTensor &a,
+                              const framework::LoDTensor &b) {
+  auto a_shape = framework::vectorize(a.dims());
+  auto b_shape = framework::vectorize(b.dims());
+  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+  if (a_size != b_size) {
+    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
+                                  a_size, b_size);
+  }
+
+  for (size_t i = 0; i < a_size; i++) {
+    if (a.type() == typeid(float)) {
+      const auto *a_data = a.data<float>();
+      const auto *b_data = b.data<float>();
+      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
+        LOG(ERROR) << string::Sprintf(
+            "tensor data %d-th element not match, %f != %f", i, a_data[i],
+            b_data[i]);
+        return false;
+      }
+    } else if (a.type() == typeid(int64_t)) {
+      const auto *a_data = a.data<int64_t>();
+      const auto *b_data = b.data<int64_t>();
+      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
+        LOG(ERROR) << string::Sprintf(
+            "tensor data %d-th element not match, %f != %f", i, a_data[i],
+            b_data[i]);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool CompareTensor(const framework::LoDTensor &a,
+                          const framework::LoDTensor &b) {
+  if (!CompareLoD(a.lod(), b.lod())) {
+    return false;
+  }
+  if (!CompareShape(framework::vectorize(a.dims()),
+                    framework::vectorize(b.dims()))) {
+    return false;
+  }
+
+  if (!CompareTensorData(a, b)) {
+    return false;
+  }
+
+  return true;
+}
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -101,6 +101,40 @@ struct PaddleTensor {
  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
 };

+enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
+// Tensor without copy, currently only supports AnalysisPredictor.
+class ZeroCopyTensor {
+ public:
+  void Reshape(const std::vector<int>& shape);
+
+  // Get the memory in CPU or GPU with specific data type, should Reshape first
+  // to tell the data size.
+  // Once can directly call this data to feed the data.
+  // This is for write the input tensor.
+  template <typename T>
+  T* mutable_data(PaddlePlace place);
+  // Get the memory directly, will return the place and memory size by pointer.
+  // This is for reading the output tensor.
+  template <typename T>
+  T* data(PaddlePlace* place, int* size);
+
+  std::vector<int64_t> shape();
+
+  void SetLoD(const std::vector<std::vector<size_t>>& x);
+  std::vector<std::vector<size_t>> lod() const;
+
+ protected:
+  ZeroCopyTensor(void* scope) : scope_{scope} {}
+  void SetName(const std::string& name) { name_ = name; }
+  void* FindTensor() const;
+
+ private:
+  std::string name_;
+  bool input_or_output_;
+  friend class AnalysisPredictor;
+  void* scope_{nullptr};
+};
+
 /*
 * A simple Inference API for Paddle.
 */
@@ -120,6 +154,19 @@ class PaddlePredictor {
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

+  // Zero copy input and output optimization.
+  // Get the input or output tensors, and operate on their memory directly,
+  // without copy.
+  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string& name) {
+    return nullptr;
+  }
+  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string& name) {
+    return nullptr;
+  }
+  virtual bool ZeroCopyRun() { return false; }
+
  // Clone a predictor that share the model weights, the Cloned predictor should
  // be thread-safe.
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
@@ -194,6 +241,14 @@ struct MixedRTConfig : public NativeConfig {
  // For workspace_size, refer it from here:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int workspace_size{1 << 30};
+  //  We transform the Ops that can be converted into TRT layer in the model,
+  //  and aggregate these Ops into subgraphs for TRT execution.
+  //  We set this variable to control the minimum number of nodes in the
+  //  subgraph, 3 as default value.
+  int minimum_subgraph_size = 3;
+  // Reserved configuration
+  // We just support "FP32" now, "FP16" and "INT8" will be supported.
+  std::string precision_mode = "FP32";
 };

 // NOTE WIP, not stable yet.
@@ -204,12 +259,18 @@ struct AnalysisConfig : public NativeConfig {
    kExclude   // Specify the disabled passes in `ir_passes`.
  };

+  // Determine whether to perform graph optimization.
  bool enable_ir_optim = true;
+  // Manually determine the IR passes to run.
  IrPassMode ir_mode{IrPassMode::kExclude};
-  // attention lstm fuse works only on some specific models, disable as default.
-  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+  std::vector<std::string> ir_passes;
+
+  // NOT stable yet.
+  bool use_feed_fetch_ops{true};

-  // NOTE this is just for internal development, please not use it.
+  // NOTE this is just for internal development, please not use it.	NOT
+  // stable
+  // yet.
  bool _use_mkldnn{false};
 };


--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
   endif()
 endif()
+
+if(WITH_GPU AND TENSORRT_FOUND)
+   set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+   endif()
+   cc_test(test_trt_models SRCS trt_models_tester.cc  
+     ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
+     DEPS paddle_inference_tensorrt_subgraph_engine)
+endif()
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -18,6 +18,8 @@ namespace paddle {
 namespace inference {
 namespace analysis {

+using contrib::AnalysisConfig;
+
 struct DataRecord {
  std::vector<int64_t> data;
  std::vector<size_t> lod;
@@ -78,6 +80,7 @@ struct DataRecord {
      }
    }
  }
+
  DataRecord NextBatch() {
    DataRecord data;
    data.data = batched_datas[batch_iter];
@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -16,6 +16,7 @@

 namespace paddle {
 namespace inference {
+using contrib::AnalysisConfig;

 struct DataRecord {
  std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);

--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -12,12 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

+DEFINE_bool(with_precision_check, true, "turn on test");
+
 namespace paddle {
 namespace inference {

 using namespace framework;  // NOLINT
+using namespace contrib;    // NOLINT

 struct DataRecord {
  std::vector<std::vector<std::vector<float>>> link_step_data_all;
@@ -29,10 +33,12 @@ struct DataRecord {
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
+
  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }
+
  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
@@ -101,6 +107,7 @@ struct DataRecord {
    num_samples = num_lines;
  }
 };
+
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-void SetConfig(contrib::AnalysisConfig *cfg) {
+void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
+                           ZeroCopyTensor *cell_init_tensor,
+                           ZeroCopyTensor *data_tensor,
+                           ZeroCopyTensor *hidden_init_tensor,
+                           ZeroCopyTensor *week_tensor,
+                           ZeroCopyTensor *minute_tensor,
+                           DataRecord *data_record, int batch_size) {
+  auto one_batch = data_record->NextBatch();
+  std::vector<int> rnn_link_data_shape(
+      {static_cast<int>(one_batch.rnn_link_data.size()),
+       static_cast<int>(one_batch.rnn_link_data.front().size())});
+  lod_attention_tensor->Reshape({1, 2});
+  lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});
+
+  cell_init_tensor->Reshape({batch_size, 15});
+  cell_init_tensor->SetLoD({one_batch.lod3});
+
+  hidden_init_tensor->Reshape({batch_size, 15});
+  hidden_init_tensor->SetLoD({one_batch.lod3});
+
+  data_tensor->Reshape(rnn_link_data_shape);
+  data_tensor->SetLoD({one_batch.lod1});
+
+  week_tensor->Reshape(
+      {static_cast<int>(one_batch.rnn_week_datas.size()),
+       static_cast<int>(one_batch.rnn_week_datas.front().size())});
+  week_tensor->SetLoD({one_batch.lod3});
+
+  minute_tensor->Reshape(
+      {static_cast<int>(one_batch.rnn_minute_datas.size()),
+       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
+  minute_tensor->SetLoD({one_batch.lod3});
+
+  // assign data
+  float arr0[] = {0, 0};
+  std::vector<float> zeros(batch_size * 15, 0);
+  std::copy_n(arr0, 2,
+              lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
+  std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
+  std::copy_n(zeros.begin(), zeros.size(),
+              cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
+  std::copy_n(zeros.begin(), zeros.size(),
+              hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
+  ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
+  ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
+  ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
+}
+
+void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+}
+
+bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
+                    const std::vector<std::string> &tensors) {
+  for (auto &x : tensors) {
+    auto *a_var = a_scope.FindVar(x);
+    auto *b_var = b_scope.FindVar(x);
+    if (a_var && b_var) {
+      if (a_var->Type() == typeid(framework::LoDTensor) ||
+          a_var->Type() == typeid(framework::Tensor)) {
+        LOG(INFO) << "comparing tensor " << x;
+        auto &a_t = a_var->Get<framework::LoDTensor>();
+        auto &b_t = b_var->Get<framework::LoDTensor>();
+        if (!inference::CompareTensor(a_t, b_t)) {
+          LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x);
+        }
+      } else {
+        LOG(INFO) << "skip no tensor " << x;
+      }
+    } else {
+      LOG(INFO) << "skip tensor " << x;
+    }
+  }
+  return true;
+}
+
+// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
+// on the complex RNN1 model.
+TEST(Analyzer_rnn1, ZeroCopy) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.use_feed_fetch_ops = false;
+
+  PaddlePlace place;
+  int output_size{0};
+
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  config.use_feed_fetch_ops = true;
+  auto native_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
+  auto analysis_predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+#define NEW_TENSOR(name__) \
+  auto name__##_tensor = predictor->GetInputTensor(#name__);
+  NEW_TENSOR(data_lod_attention);
+  NEW_TENSOR(cell_init);
+  NEW_TENSOR(data);
+  NEW_TENSOR(week);
+  NEW_TENSOR(minute);
+  NEW_TENSOR(hidden_init);
+
+  // Prepare data for AnalysisPredictor
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
+                        data_tensor.get(), hidden_init_tensor.get(),
+                        week_tensor.get(), minute_tensor.get(), &data,
+                        FLAGS_batch_size);
+
+  // Prepare data for NativePredictor
+  std::vector<std::vector<PaddleTensor>> native_inputs;
+  SetInput(&native_inputs);
+  std::vector<PaddleTensor> native_outputs;
+  std::vector<PaddleTensor> analysis_outputs;
+
+  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
+  // Run analysis predictor
+
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
+  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
+  ASSERT_EQ(num_ops,
+            13);  // After graph optimization, only 13 operators exists.
+
+  Timer timer;
+  double total_time{0};
+  double native_total_time{0};
+  double analysis_total_time{0.};
+
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    timer.tic();
+    predictor->ZeroCopyRun();
+    total_time += timer.toc();
+  }
+
+  auto *output_data = output_tensor->data<float>(&place, &output_size);
+  ASSERT_GT(output_size, 0);  // more than one output!
+
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    // Run native predictor.
+    timer.tic();
+    ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
+    native_total_time += timer.toc();
+  }
+
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    timer.tic();
+    ASSERT_TRUE(
+        analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
+    analysis_total_time += timer.toc();
+  }
+
+  if (!FLAGS_with_precision_check) {
+    return;
+  }
+  int native_output_size = VecReduceToInt(native_outputs.front().shape);
+
+  EXPECT_EQ(native_output_size, output_size);
+
+  // Compare tensors between analysis and zerocopy
+  auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
+  auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
+  auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());
+
+  std::vector<std::string> tensor_names;
+  for (auto &var_desc : p0->program().Block(0).AllVars()) {
+    tensor_names.push_back(var_desc->Name());
+  }
+
+  LOG(INFO) << "Comparing tensors";
+  ASSERT_TRUE(
+      CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
+  ASSERT_TRUE(
+      CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));
+
+  LOG(INFO) << "output1 " << inference::LoDTensorSummary<float>(
+                                 p0->scope()
+                                     ->FindVar("final_output.tmp_1")
+                                     ->Get<framework::LoDTensor>());
+  LOG(INFO) << "output2 " << inference::LoDTensorSummary<float>(
+                                 p1->scope()
+                                     ->FindVar("final_output.tmp_1")
+                                     ->Get<framework::LoDTensor>());
+  LOG(INFO) << "output3 " << inference::LoDTensorSummary<float>(
+                                 p2->scope()
+                                     ->FindVar("final_output.tmp_1")
+                                     ->Get<framework::LoDTensor>());
+
+  for (int i = 0; i < output_size; i++) {
+    LOG(INFO) << output_data[i] << " "
+              << static_cast<float *>(native_outputs.front().data.data())[i]
+              << " "
+              << static_cast<float *>(analysis_outputs.front().data.data())[i];
+    EXPECT_NEAR(output_data[i],
+                static_cast<float *>(native_outputs.front().data.data())[i],
+                1e-3);
+  }
+
+  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
+
+  LOG(INFO) << "zero average time: "
+            << total_time / (FLAGS_repeat * FLAGS_batch_size);
+  LOG(INFO) << "analysis average time: "
+            << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
+  LOG(INFO) << "native average time: "
+            << native_total_time / (FLAGS_repeat * FLAGS_batch_size);
+}
+
+TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.use_feed_fetch_ops = false;
+
+#define NEW_TENSOR(name__) \
+  auto name__##_tensor = predictor->GetInputTensor(#name__);
+
+  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  double total_time_of_threads{0};
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  }
+
+  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
+    threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
+      // auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
+      NEW_TENSOR(data_lod_attention);
+      NEW_TENSOR(cell_init);
+      NEW_TENSOR(data);
+      NEW_TENSOR(week);
+      NEW_TENSOR(minute);
+      NEW_TENSOR(hidden_init);
+
+      // Prepare data for AnalysisPredictor
+      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+      Timer timer;
+      double total_time{0};
+
+      for (int i = 0; i < FLAGS_repeat; i++) {
+        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
+                              cell_init_tensor.get(), data_tensor.get(),
+                              hidden_init_tensor.get(), week_tensor.get(),
+                              minute_tensor.get(), &data, FLAGS_batch_size);
+
+        timer.tic();
+        predictor->ZeroCopyRun();
+        total_time += timer.toc();
+      }
+
+      total_time_of_threads += total_time;
+
+      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
+    });
+  }
+
+  for (auto &t : threads) {
+    t.join();
+  }
+
+  LOG(INFO) << "average time: "
+            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
 }

 // Compare result of NativeConfig and AnalysisConfig

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 namespace analysis {
+using contrib::AnalysisConfig;

 struct Record {
  std::vector<float> data;
@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  int num_ops;
-  GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
 }

 // Compare result of NativeConfig and AnalysisConfig

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

-std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
+std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
                                                   int *num_ops) {
-  auto predictor = CreateTestPredictor(config);
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
  auto &fuse_statis = analysis_predictor->analysis_argument()
                          .Get<std::unordered_map<std::string, int>>(
                              framework::ir::kFuseStatisAttr);

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+using paddle::contrib::MixedRTConfig;
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+NativeConfig GetConfigNative() {
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname;
+  // LOG(INFO) << "dirname  " << config.model_dir;
+  config.fraction_of_gpu_memory = 0.45;
+  config.use_gpu = true;
+  config.device = 0;
+  return config;
+}
+
+MixedRTConfig GetConfigTRT() {
+  MixedRTConfig config;
+  config.model_dir = FLAGS_dirname;
+  config.use_gpu = true;
+  config.fraction_of_gpu_memory = 0.2;
+  config.device = 0;
+  config.max_batch_size = 3;
+  return config;
+}
+
+void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
+  NativeConfig config0 = GetConfigNative();
+  config0.model_dir = model_dirname;
+
+  MixedRTConfig config1 = GetConfigTRT();
+  config1.model_dir = model_dirname;
+  config1.max_batch_size = batch_size;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
+      CreatePaddlePredictor<MixedRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+  // Prepare inputs
+  int height = 224;
+  int width = 224;
+  float *data = new float[batch_size * 3 * height * width];
+  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
+  data[0] = 1.0f;
+
+  // Prepare inputs
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data = PaddleBuf(static_cast<void *>(data),
+                          sizeof(float) * (batch_size * 3 * height * width));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  // Prepare outputs
+  std::vector<PaddleTensor> outputs0;
+  std::vector<PaddleTensor> outputs1;
+  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
+
+  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+
+  // Get output.
+  ASSERT_EQ(outputs0.size(), 1UL);
+  ASSERT_EQ(outputs1.size(), 1UL);
+
+  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+  EXPECT_EQ(num_elements, num_elements1);
+
+  auto *data0 = static_cast<float *>(outputs0.front().data.data());
+  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+  ASSERT_GT(num_elements, 0UL);
+  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+  }
+}
+
+TEST(trt_models_test, main) {
+  std::vector<std::string> infer_models = {"mobilenet", "resnet50",
+                                           "resnext50"};
+  for (auto &model_dir : infer_models) {
+    CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
+  }
+}
+}  // namespace paddle
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -36,6 +36,8 @@ namespace memory {
 using BuddyAllocator = detail::BuddyAllocator;

 BuddyAllocator* GetCPUBuddyAllocator() {
+  // We tried thread_local for inference::RNN1 model, but that not works much
+  // for multi-thread test.
  static std::once_flag init_flag;
  static detail::BuddyAllocator* a = nullptr;

@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() {
  return a;
 }

+// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation,
+// seems they are almost the same overhead.
+struct NaiveAllocator {
+  void* Alloc(size_t size) { return malloc(size); }
+
+  void Free(void* p) {
+    PADDLE_ENFORCE(p);
+    free(p);
+  }
+
+  static NaiveAllocator* Instance() {
+    static NaiveAllocator x;
+    return &x;
+  }
+
+ private:
+  std::mutex lock_;
+};
+
 template <>
 void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
                      "Out and Label should have same height.");

    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
+    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
+
+    PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
+    PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");

    ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("BatchAUC", {1});
-    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
-    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
+
+    slide_steps = slide_steps == 0 ? 1 : slide_steps;
+    ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
  }

 protected:
@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data. "
             "shape: [batch_size, 1]");
+
    // TODO(typhoonzero): support weight input
    AddInput("StatPos", "Statistic value when label = 1");
    AddInput("StatNeg", "Statistic value when label = 0");
@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
-    AddOutput("BatchAUC", "The AUC for current batch");
+
    AddOutput("StatPosOut", "Statistic value when label = 1");
    AddOutput("StatNegOut", "Statistic value when label = 0");

    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");

-    AddAttr<int>("num_thresholds",
-                 "The number of thresholds to use when discretizing the"
-                 " roc curve.")
+    AddAttr<int>(
+        "num_thresholds",
+        "The number of thresholds to use when discretizing the roc curve.")
        .SetDefault((2 << 12) - 1);
-
+    AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
+        .SetDefault(1);
    AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.


--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {

    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    // buckets contain numbers from 0 to num_thresholds
    int num_pred_buckets = num_thresholds + 1;
+    int slide_steps = ctx.Attr<int>("slide_steps");

    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

-    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
-            auc);
+    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());

-    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
-    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
-    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
-    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
-            num_thresholds, batch_auc);
+    std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
+
+    auto stat_pos_calc = stat_pos_data.data();
+    auto stat_neg_calc = stat_neg_data.data();
+
+    statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
+            origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
+
+    calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
  }

 private:
@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
  }

-  inline static void calcAuc(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *label,
+  inline static void statAuc(const framework::Tensor *label,
                             const framework::Tensor *predict,
-                             int64_t *stat_pos, int64_t *stat_neg,
-                             int num_thresholds,
-                             framework::Tensor *auc_tensor) {
+                             const int num_pred_buckets,
+                             const int num_thresholds, const int slide_steps,
+                             int64_t *origin_stat_pos, int64_t *origin_stat_neg,
+                             int64_t **stat_pos, int64_t **stat_neg) {
    size_t batch_size = predict->dims()[0];
    size_t inference_width = predict->dims()[1];
    const T *inference_data = predict->data<T>();
    const auto *label_data = label->data<int64_t>();

-    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
    for (size_t i = 0; i < batch_size; i++) {
      uint32_t binIdx = static_cast<uint32_t>(
          inference_data[i * inference_width + 1] * num_thresholds);
      if (label_data[i]) {
-        stat_pos[binIdx] += 1.0;
+        (*stat_pos)[binIdx] += 1.0;
      } else {
-        stat_neg[binIdx] += 1.0;
+        (*stat_neg)[binIdx] += 1.0;
      }
    }

+    int bucket_length = num_pred_buckets * sizeof(int64_t);
+
+    // will stat auc unlimited.
+    if (slide_steps == 0) {
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        origin_stat_pos[slide] += (*stat_pos)[slide];
+        origin_stat_neg[slide] += (*stat_neg)[slide];
+      }
+
+      *stat_pos = origin_stat_pos;
+      *stat_neg = origin_stat_neg;
+
+    } else {
+      for (int slide = 1; slide < slide_steps; ++slide) {
+        int dst_idx = (slide - 1) * num_pred_buckets;
+        int src_inx = slide * num_pred_buckets;
+        std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
+                    bucket_length);
+        std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
+                    bucket_length);
+      }
+
+      std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
+                  *stat_pos, bucket_length);
+      std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
+                  *stat_neg, bucket_length);
+
+      std::memset(*stat_pos, 0, bucket_length);
+      std::memset(*stat_neg, 0, bucket_length);
+
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        int stat_pos_steps = 0;
+        int stat_neg_steps = 0;
+        for (int step = 0; step < slide_steps; ++step) {
+          stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
+          stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
+        }
+        (*stat_pos)[slide] += stat_pos_steps;
+        (*stat_neg)[slide] += stat_neg_steps;
+      }
+    }
+  }
+
+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
+
    *auc = 0.0f;

    double totPos = 0.0;
@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
      totPos += stat_pos[idx];
      totNeg += stat_neg[idx];
      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-
      --idx;
    }


--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
 polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
-detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+
+if(WITH_GPU)
+  detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
+else()
+  detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+endif()
+
 detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
 #Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"

@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
-        platform::CPUPlace());
+        ctx.device_context());
  }
 };

@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  T im_scale = im_info_data[2];
-  keep->Resize({boxes->dims()[0], 1});
+  keep->Resize({boxes->dims()[0]});
  min_size = std::max(min_size, 1.0f);
  int *keep_data = keep->mutable_data<int>(ctx.GetPlace());

@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("post_nms_topN", "post_nms_topN");
    AddAttr<float>("nms_thresh", "nms_thres");
    AddAttr<float>("min_size", "min size");
-    AddAttr<float>("eta", "eta");
+    AddAttr<float>("eta", "The parameter for adaptive NMS.");
    AddComment(R"DOC(
 Generate Proposals OP


--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/gather.cu.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+namespace {
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+int const kThreadsPerBlock = sizeof(uint64_t) * 8;
+
+template <typename T>
+__global__ void RangeInitKernel(const T start, const T delta, const int size,
+                                T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
+}
+
+template <typename T>
+void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
+                    Tensor *value_out, Tensor *index_out) {
+  int num = value.numel();
+  Tensor index_in_t;
+  int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
+  int block = 512;
+  auto stream = ctx.stream();
+  RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
+  int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
+
+  const T *keys_in = value.data<T>();
+  T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
+
+  // Determine temporary device storage requirements
+  void *d_temp_storage = NULL;
+  size_t temp_storage_bytes = 0;
+  cub::DeviceRadixSort::SortPairsDescending<T, int>(
+      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
+      num);
+
+  // Allocate temporary storage
+  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  d_temp_storage = memory::Alloc(place, temp_storage_bytes);
+
+  // Run sorting operation
+  cub::DeviceRadixSort::SortPairsDescending<T, int>(
+      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
+      num);
+
+  memory::Free(place, d_temp_storage);
+}
+
+template <typename T>
+__device__ __forceinline__ T Min(T x, T y) {
+  return x < y ? x : y;
+}
+
+template <typename T>
+__device__ __forceinline__ T Max(T x, T y) {
+  return x > y ? x : y;
+}
+
+template <typename T>
+__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
+                                       const T *var, const int *index,
+                                       const T *im_info, const int num,
+                                       T *proposals) {
+  T kBBoxClipDefault = log(1000.0 / 16.0);
+  CUDA_1D_KERNEL_LOOP(i, num) {
+    int k = index[i] * 4;
+    T axmin = anchor[k];
+    T aymin = anchor[k + 1];
+    T axmax = anchor[k + 2];
+    T aymax = anchor[k + 3];
+
+    T w = axmax - axmin + 1.0;
+    T h = aymax - aymin + 1.0;
+    T cx = axmin + 0.5 * w;
+    T cy = aymin + 0.5 * h;
+
+    T dxmin = deltas[k];
+    T dymin = deltas[k + 1];
+    T dxmax = deltas[k + 2];
+    T dymax = deltas[k + 3];
+
+    T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
+    if (var) {
+      d_cx = cx + dxmin * w * var[k];
+      d_cy = cy + dymin * h * var[k + 1];
+      d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
+      d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
+    } else {
+      d_cx = cx + dxmin * w;
+      d_cy = cy + dymin * h;
+      d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
+      d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
+    }
+
+    T oxmin = d_cx - d_w * 0.5;
+    T oymin = d_cy - d_h * 0.5;
+    T oxmax = d_cx + d_w * 0.5 - 1.;
+    T oymax = d_cy + d_h * 0.5 - 1.;
+
+    proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
+    proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
+  }
+}
+
+template <typename T, int BlockSize>
+__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
+                             const T min_size, const int num, int *keep_num,
+                             int *keep) {
+  T im_h = im_info[0];
+  T im_w = im_info[1];
+  T im_scale = im_info[2];
+
+  int cnt = 0;
+  __shared__ int keep_index[BlockSize];
+
+  CUDA_1D_KERNEL_LOOP(i, num) {
+    keep_index[threadIdx.x] = -1;
+    __syncthreads();
+
+    int k = i * 4;
+    T xmin = bboxes[k];
+    T ymin = bboxes[k + 1];
+    T xmax = bboxes[k + 2];
+    T ymax = bboxes[k + 3];
+
+    T w = xmax - xmin + 1.0;
+    T h = ymax - ymin + 1.0;
+    T cx = xmin + w / 2.;
+    T cy = ymin + h / 2.;
+
+    T w_s = (xmax - xmin) / im_scale + 1.;
+    T h_s = (ymax - ymin) / im_scale + 1.;
+
+    if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
+      keep_index[threadIdx.x] = i;
+    }
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      int size = (num - i) < BlockSize ? num - i : BlockSize;
+      for (int j = 0; j < size; ++j) {
+        if (keep_index[j] > -1) {
+          keep[cnt++] = keep_index[j];
+        }
+      }
+    }
+    __syncthreads();
+  }
+  if (threadIdx.x == 0) {
+    keep_num[0] = cnt;
+  }
+}
+
+__device__ inline float IoU(const float *a, const float *b) {
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float inter_s = width * height;
+  float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return inter_s / (s_a + s_b - inter_s);
+}
+
+__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
+                          const float *dev_boxes, uint64_t *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  const int row_size =
+      min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
+
+  __shared__ float block_boxes[kThreadsPerBlock * 4];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 4 + 0] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+    block_boxes[threadIdx.x * 4 + 1] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+    block_boxes[threadIdx.x * 4 + 2] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+    block_boxes[threadIdx.x * 4 + 3] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 4;
+    int i = 0;
+    uint64_t t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+template <typename T>
+void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
+         const Tensor &sorted_indices, const T nms_threshold,
+         Tensor *keep_out) {
+  int boxes_num = proposals.dims()[0];
+  PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
+
+  const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
+  dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
+              DIVUP(boxes_num, kThreadsPerBlock));
+  dim3 threads(kThreadsPerBlock);
+
+  const T *boxes = proposals.data<T>();
+  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
+  uint64_t *d_mask =
+      reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
+  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
+  uint64_t *h_mask = reinterpret_cast<uint64_t *>(
+      memory::Alloc(platform::CPUPlace(), size_bytes));
+  memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
+
+  std::vector<uint64_t> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
+
+  std::vector<int> keep_vec;
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / kThreadsPerBlock;
+    int inblock = i % kThreadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      ++num_to_keep;
+      keep_vec.push_back(i);
+      uint64_t *p = &h_mask[0] + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+  int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
+  memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
+               sizeof(int) * num_to_keep, 0);
+  memory::Free(place, d_mask);
+  memory::Free(platform::CPUPlace(), h_mask);
+}
+
+template <typename T>
+std::pair<Tensor, Tensor> ProposalForOneImage(
+    const platform::CUDADeviceContext &ctx, const Tensor &im_info,
+    const Tensor &anchors, const Tensor &variances,
+    const Tensor &bbox_deltas,  // [M, 4]
+    const Tensor &scores,       // [N, 1]
+    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
+    float eta) {
+  // 1. pre nms
+  Tensor scores_sort, index_sort;
+  SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
+  int num = scores.numel();
+  int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
+                                                                : pre_nms_top_n;
+  scores_sort.Resize({pre_nms_num, 1});
+  index_sort.Resize({pre_nms_num, 1});
+
+  // 2. box decode and clipping
+  Tensor proposals;
+  proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
+  int block = 512;
+  auto stream = ctx.stream();
+  BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
+      anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
+      index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
+      proposals.data<T>());
+
+  // 3. filter
+  Tensor keep_index, keep_num_t;
+  keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
+  keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
+  min_size = std::max(min_size, 1.0f);
+  FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
+      proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
+      keep_num_t.data<int>(), keep_index.data<int>());
+  int keep_num;
+  const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
+               keep_num_t.data<int>(), sizeof(int), 0);
+  keep_index.Resize({keep_num});
+
+  Tensor scores_filter, proposals_filter;
+  proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
+  scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
+  GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+  GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+
+  if (nms_thresh <= 0) {
+    return std::make_pair(proposals_filter, scores_filter);
+  }
+
+  // 4. nms
+  Tensor keep_nms;
+  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
+  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+    keep_nms.Resize({post_nms_top_n});
+  }
+
+  Tensor scores_nms, proposals_nms;
+  proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
+  scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
+  GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+  GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+
+  return std::make_pair(proposals_nms, scores_nms);
+}
+}  // namespace
+
+template <typename DeviceContext, typename T>
+class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *scores = context.Input<Tensor>("Scores");
+    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
+    auto *im_info = context.Input<Tensor>("ImInfo");
+    auto *anchors = context.Input<Tensor>("Anchors");
+    auto *variances = context.Input<Tensor>("Variances");
+
+    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
+    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
+
+    int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
+    int post_nms_top_n = context.Attr<int>("post_nms_topN");
+    float nms_thresh = context.Attr<float>("nms_thresh");
+    float min_size = context.Attr<float>("min_size");
+    float eta = context.Attr<float>("eta");
+    PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS.");
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+
+    auto scores_dim = scores->dims();
+    int64_t num = scores_dim[0];
+    int64_t c_score = scores_dim[1];
+    int64_t h_score = scores_dim[2];
+    int64_t w_score = scores_dim[3];
+
+    auto bbox_dim = bbox_deltas->dims();
+    int64_t c_bbox = bbox_dim[1];
+    int64_t h_bbox = bbox_dim[2];
+    int64_t w_bbox = bbox_dim[3];
+
+    Tensor bbox_deltas_swap, scores_swap;
+    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
+                                     dev_ctx.GetPlace());
+    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
+                                dev_ctx.GetPlace());
+
+    math::Transpose<DeviceContext, T, 4> trans;
+    std::vector<int> axis = {0, 2, 3, 1};
+    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
+    trans(dev_ctx, *scores, &scores_swap, axis);
+
+    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
+    anchor->Resize({anchors->numel() / 4, 4});
+    Tensor *var = const_cast<framework::Tensor *>(variances);
+    var->Resize({var->numel() / 4, 4});
+
+    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
+                              context.GetPlace());
+    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
+
+    T *rpn_rois_data = rpn_rois->data<T>();
+    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
+
+    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
+
+    int64_t num_proposals = 0;
+    std::vector<size_t> offset(1, 0);
+    for (int64_t i = 0; i < num; ++i) {
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
+      Tensor scores_slice = scores_swap.Slice(i, i + 1);
+
+      bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
+      scores_slice.Resize({h_score * w_score * c_score, 1});
+
+      std::pair<Tensor, Tensor> box_score_pair =
+          ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
+                                 bbox_deltas_slice, scores_slice, pre_nms_top_n,
+                                 post_nms_top_n, nms_thresh, min_size, eta);
+
+      Tensor proposals = box_score_pair.first;
+      Tensor scores = box_score_pair.second;
+
+      memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
+                   proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
+      memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
+                   scores.data<T>(), sizeof(T) * scores.numel(), 0);
+      num_proposals += proposals.dims()[0];
+      offset.emplace_back(num_proposals);
+    }
+    framework::LoD lod;
+    lod.emplace_back(offset);
+    rpn_rois->set_lod(lod);
+    rpn_roi_probs->set_lod(lod);
+    rpn_rois->Resize({num_proposals, 4});
+    rpn_roi_probs->Resize({num_proposals, 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(generate_proposals,
+                        ops::CUDAGenerateProposalsKernel<
+                            paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
    int class_num = ctx.Attr<int>("class_num");

-    auto label_lod = in_label->lod();
-    auto detect_lod = in_detect->lod();
+    auto& label_lod = in_label->lod();
+    auto& detect_lod = in_detect->lod();
    PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
                      "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto labels = framework::EigenTensor<T, 2>::From(input_label);
    auto detect = framework::EigenTensor<T, 2>::From(input_detect);

-    auto label_lod = input_label.lod();
-    auto detect_lod = input_detect.lod();
+    auto& label_lod = input_label.lod();
+    auto& detect_lod = input_detect.lod();

    int batch_size = label_lod[0].size() - 1;
-    auto label_index = label_lod[0];
+    auto& label_index = label_lod[0];

    for (int n = 0; n < batch_size; ++n) {
      std::map<int, std::vector<Box>> boxes;
@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {

    output_true_pos->set_lod(true_pos_lod);
    output_false_pos->set_lod(false_pos_lod);
-    return;
  }

  void GetInputPos(const framework::Tensor& input_pos_count,
@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto SetData = [](const framework::LoDTensor& pos_tensor,
                      std::map<int, std::vector<std::pair<T, int>>>& pos) {
      const T* pos_data = pos_tensor.data<T>();
-      auto pos_data_lod = pos_tensor.lod()[0];
+      auto& pos_data_lod = pos_tensor.lod()[0];
      for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
        for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
          T score = pos_data[j * 2];
@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
    int batch_size = gt_boxes.size();
    for (int n = 0; n < batch_size; ++n) {
-      auto image_gt_boxes = gt_boxes[n];
-      for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
+      auto& image_gt_boxes = gt_boxes[n];
+      for (auto& image_gt_box : image_gt_boxes) {
        size_t count = 0;
-        auto labeled_bboxes = it->second;
+        auto& labeled_bboxes = image_gt_box.second;
        if (evaluate_difficult) {
          count = labeled_bboxes.size();
        } else {
-          for (size_t i = 0; i < labeled_bboxes.size(); ++i)
-            if (!(labeled_bboxes[i].is_difficult)) ++count;
+          for (auto& box : labeled_bboxes) {
+            if (!box.is_difficult) {
+              ++count;
+            }
+          }
        }
        if (count == 0) {
          continue;
        }
-        int label = it->first;
+        int label = image_gt_box.first;
        if (label_pos_count->find(label) == label_pos_count->end()) {
          (*label_pos_count)[label] = count;
        } else {

--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

-    auto in_rows = in.rows();
+    auto &in_rows = in.rows();
    auto out_dim = framework::make_ddim(
        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
  PADDLE_ENFORCE_EQ(b_dims[0], 1,
                    "The first dimension of Input(Bias) should be 1.");
-  PADDLE_ENFORCE_EQ(
-      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
-      "The second dimension of Input(Bias) should be "
-      "7 * %d if enable peepholes connection or"
-      "4 * %d if disable peepholes",
-      frame_size, frame_size);
+  if (ctx->Attrs().Get<bool>("use_peepholes")) {
+    PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                      "The second dimension of Input(Bias) should be "
+                      "7 * %d if enable peepholes connection",
+                      frame_size);
+    ctx->SetOutputDim("CheckedCell", {2, frame_size});
+  } else {
+    PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Bias) should be "
+                      "4 * %d if disable peepholes",
+                      frame_size);
+  }

  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() {
  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
+  AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
+      .AsIntermediate();
  AddAttr<bool>("use_peepholes",
                "(bool, defalut: True) "
                "whether to enable diagonal/peephole connections.")
@@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  const int D3 = D * 3;                  \
  const int D4 = wh_dims[1];

-#define INIT_BASE_INPUT_DATAS                                        \
-  const T* x_data = x->data<T>();                                    \
-  const T* wx_data = wx->data<T>();                                  \
-  const T* wh_data = wh->data<T>();                                  \
-  /* diagonal weight*/                                               \
-  const T* wc_data = bias->data<T>() + D4;                           \
-  /* for peephole only*/                                             \
-  Tensor checked_cell;                                               \
-  T* checked_cell_data = nullptr;                                    \
-  auto place = ctx.GetPlace();                                       \
-  if (use_peepholes) {                                               \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
-    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
+#define INIT_BASE_INPUT_DATAS                                 \
+  const T* x_data = x->data<T>();                             \
+  const T* wx_data = wx->data<T>();                           \
+  const T* wh_data = wh->data<T>();                           \
+  /* diagonal weight*/                                        \
+  const T* wc_data = bias->data<T>() + D4;                    \
+  /* for peephole only*/                                      \
+  T* checked_cell_data = nullptr;                             \
+  auto place = ctx.GetPlace();                                \
+  if (use_peepholes) {                                        \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/          \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");   \
+    checked_cell_data = checked_cell->mutable_data<T>(place); \
  }

 /// Compute LSTM

--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());

      // TODO(yuyang18): Strange code here.
-      memory::Copy(platform::CPUPlace(),
-                   new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
-                   ids_data, ids_num * sizeof(int64_t), stream);
-
+      memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
+                   gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
      d_table->set_rows(new_rows);

      auto *d_table_value = d_table->mutable_value();

--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
    auto out_place = context.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(out_place));

-    memory::Copy(
-        boost::get<platform::CUDAPlace>(out_place), out_data,
-        boost::get<platform::CUDAPlace>(in1_place), in1_data,
-        in1_value.numel() * sizeof(T),
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
+                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T), context.stream());

    auto* in2_data = in2_value.data<T>();
    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
    auto in1_height = input1.height();
    PADDLE_ENFORCE_EQ(in1_height, input2->height());

-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();
    auto& in2_rows = *(input2->mutable_rows());

    auto& in1_value = input1.value();

--- a/paddle/fluid/operators/sampling_id_op.cc
+++ b/paddle/fluid/operators/sampling_id_op.cc
@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
 SamplingId Operator.
 A layer for sampling id from multinomial distribution from the
 input. Sampling one id for one sample.)DOC");
-    AddAttr<float>("min", "Minimum value of random. [default 0.0].")
+    AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
        .SetDefault(0.0f);
-    AddAttr<float>("max", "Maximun value of random. [default 1.0].")
+    AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "Random seed used for the random number engine. "
-                 "0 means use a seed generated by the system."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time. [default 0].")
+    AddAttr<int>(
+        "seed",
+        "Random seed used for the random number engine. "
+        "0 means use a seed generated by the system."
+        "Note that if seed is not 0, this operator will always "
+        "generate the same random numbers every time. (int, default 0).")
        .SetDefault(0);
  }
 };

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
    auto out_var_name = op_desc.Output("Out").front();
    auto *out_var = block->FindVarRecursive(out_var_name);

-    out_var->SetType(in_var.GetType());
-    out_var->SetDataType(in_var.GetDataType());
+    if (in_var_name != out_var_name) {
+      out_var->SetType(in_var.GetType());
+      out_var->SetDataType(in_var.GetDataType());
+    }
  }
 };


--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    }

    for (size_t i = 0; i < n; ++i) {
-      PADDLE_ENFORCE_LT(0, offset_data[i],
+      PADDLE_ENFORCE_LE(0, offset_data[i],
                        "The offset[%d] must greater than zero.", i);
      PADDLE_ENFORCE_LT(0, length_data[i],
                        "The length[%d] must greater than zero.", i);
-      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
+      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
                        lod[0][i + 1], "The target tensor's length overflow.");
    }


--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sgd_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
  }
 }

-template <typename T, int block_size>
+template <typename T>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                       const int64_t* rows,
                                       const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we have to use
-    // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(
-        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
+                                       int64_t row_numel, int64_t limit) {
+  for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
+    const T* selected_rows_ptr = selected_rows + i * row_numel;
+    T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
+    for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
+      // Since index in rows of SelectedRows can be duplicate, we have to use
+      // Atomic Operation to avoid concurrent write error.
+      paddle::platform::CudaAtomicAdd(
+          tensor_out_ptr + index,
+          -1.0 * learning_rate[0] * selected_rows_ptr[index]);
+    }
  }
 }
 }  // namespace
@@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);

      auto& in_value = grad->value();
-      framework::Vector<int64_t> in_rows(grad->rows());
+      auto& in_rows = grad->rows();

      int64_t in_row_numel = in_value.numel() / in_rows.size();
      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      auto* in_data = in_value.data<T>();
      auto* out_data = param_out->data<T>();

-      const int block_size = 256;
-      dim3 threads(block_size, 1);
-      dim3 grid(1, in_rows.size());
-      SparseSGDFunctorKernel<
-          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+      const int kThreadsPerBlock = 256;
+      int thread_x = kThreadsPerBlock;
+      int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
+      int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+      SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
+                               ctx.cuda_device_context().stream()>>>(
          in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
-          out_data, in_row_numel);
+          out_data, in_row_numel, in_rows.size());

    } else {
      PADDLE_THROW("Unsupported Variable Type of Grad");

--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto in_vars = context.MultiInputVar("X");
-    int N = in_vars.size();
+    size_t in_num = in_vars.size();
    auto out_var = context.OutputVar("Out");

    bool in_place = out_var == in_vars[0];
@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
      auto &place =
          *context.template device_context<DeviceContext>().eigen_device();
      // If in_place, just skip the first tensor
-      for (int i = in_place ? 1 : 0; i < N; i++) {
+      for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
          if (in_t.numel() == 0) {
@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {

      // Runtime InferShape
      size_t first_dim = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        first_dim += sel_row.rows().size();
      }

      std::vector<int64_t> in_dim;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() > 0) {
          in_dim = framework::vectorize(sel_row.value().dims());
@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> {
      }
      if (in_dim.empty()) {
        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+        in_dim =
+            framework::vectorize(get_selected_row(in_num - 1).value().dims());
      } else {
        in_dim[0] = static_cast<int64_t>(first_dim);
      }

      out_value->Resize(framework::make_ddim(in_dim));
      out_value->mutable_data<T>(context.GetPlace());
-
      // if all the input sparse vars are empty, no need to
      // merge these vars.
      if (first_dim == 0UL) {
@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> {
      math::SelectedRowsAddTo<DeviceContext, T> functor;

      int64_t offset = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() == 0) {
          continue;

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -22,8 +22,6 @@
 namespace paddle {

 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");

 namespace operators {

@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
+    AddAttr<int>("max_batch_size", "the maximum batch size.");
+    AddAttr<int>("workspace_size", "the workspace size.");
    AddComment("TensorRT engine operator.");
  }
 };

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -28,8 +28,6 @@
 namespace paddle {

 DECLARE_int32(tensorrt_engine_batch_size);
-DECLARE_int32(tensorrt_max_batch_size);
-DECLARE_int32(tensorrt_workspace_size);

 namespace operators {

@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    int max_batch_size = context.Attr<int>("max_batch_size");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
    }
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-                      FLAGS_tensorrt_max_batch_size);
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);

    std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch = FLAGS_tensorrt_max_batch_size;
-    auto max_workspace = FLAGS_tensorrt_workspace_size;
+    int max_batch_size = context.Attr<int>("max_batch_size");
+    int workspace_size = context.Attr<int>("workspace_size");
+
    auto params = context.Attr<std::vector<std::string>>("parameters");
    std::unordered_set<std::string> parameters;
    for (const auto& param : params) {
@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {

    // TODO(Superjomn) replace this with a different stream
    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
        context.Attr<std::string>("engine_uniq_key"),
        boost::get<platform::CUDAPlace>(context.GetPlace()).device);


--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;

 TEST(TensorRTEngineOp, manual) {
-  FLAGS_tensorrt_engine_batch_size = 2;
-  FLAGS_tensorrt_max_batch_size = 2;
  framework::ProgramDesc program;
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                    std::vector<std::string>({}));
@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
 }

 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
-  FLAGS_tensorrt_engine_batch_size = batch_size;
-  FLAGS_tensorrt_max_batch_size = batch_size;
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CUDAPlace place;
@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {

  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::vector<std::string>>(
      engine_op_desc.Proto(), "parameters",
      std::vector<std::string>({"y0", "y1", "y2", "y3"}));

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt

-set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method pass_builder)
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
 if(NOT WIN32)
 list(APPEND PYBIND_DEPS parallel_executor profiler)

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_output", &pd::OpDesc::SetOutput)
      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
-      .def("rename_input", &pd::OpDesc::RenameInput)
-      .def("rename_output", &pd::OpDesc::RenameOutput)
+      .def("_rename_input", &pd::OpDesc::RenameInput)
+      .def("_rename_output", &pd::OpDesc::RenameOutput)
      .def("has_attr", &pd::OpDesc::HasAttr)
      .def("attr_type", &pd::OpDesc::GetAttrType)
      .def("attr_names", &pd::OpDesc::AttrNames)
-      .def("set_attr", &pd::OpDesc::SetAttr)
+      .def("_set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
@@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) {
             std::string ser(seriralized);
             self.SetAttr(name, ser);
           })
-      .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
-      .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
+      .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
+      .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
      .def("check_attrs", &pd::OpDesc::CheckAttrs)
      .def("infer_shape", &pd::OpDesc::InferShape)
      .def("infer_var_type", &pd::OpDesc::InferVarType)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("is_profiler_enabled", platform::IsProfileEnabled);
  m.def("reset_profiler", platform::ResetProfiler);

+  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
+  pass.def(py::init())
+      .def("set_str", [](ir::Pass &self, const std::string &name,
+                         const std::string &attr) {
+        self.Set<std::string>(name, new std::string(attr));
+      });
+
+  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
+      m, "PassBuilder");
+  pb.def(py::init())
+      .def("append_pass",
+           [](ir::PassBuilder &self,
+              const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
+             return self.AppendPass(pass_type);
+           })
+      .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
+      .def("insert_pass",
+           [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
+             return self.InsertPass(idx, pass_type);
+           })
+      .def("remove_pass",
+           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
+
  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
@@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle.
                    },
                    [](BuildStrategy &self, bool b) {
                      self.fuse_elewise_add_act_ops_ = b;
-                    });
+                    })
+      .def("_create_passes_from_strategy",
+           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
+             return self.CreatePassesFromStrategy();
+           });

  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &,

--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
@@ -56,13 +56,13 @@ struct Style {
 };

 template <typename... Args>
-static void PrettyLogEndl(const std::string& style, const char* fmt,
-                          const Args&... args) {
+static void PrettyLogEndl(const std::string &style, const char *fmt,
+                          const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
 }
 template <typename... Args>
-static void PrettyLog(const std::string& style, const char* fmt,
-                      const Args&... args) {
+static void PrettyLog(const std::string &style, const char *fmt,
+                      const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset();
 }


--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
+function(train_test TARGET_NAME)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+    set(arg_list "")
+    if(train_test_ARGS)
+        foreach(arg ${train_test_ARGS})
+            list(APPEND arg_list "_${arg}")
+        endforeach()
+    else()
+        list(APPEND arg_list "_")
+    endif()
+    foreach(arg ${arg_list})
+        string(REGEX REPLACE "^_$" "" arg "${arg}")
+        cc_test(test_train_${TARGET_NAME}${arg}
+                SRCS test_train_${TARGET_NAME}.cc
+                DEPS paddle_fluid_origin
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
+        set_tests_properties(test_train_${TARGET_NAME}${arg}
+                PROPERTIES DEPENDS test_${TARGET_NAME})
+    endforeach()
+endfunction(train_test)
+
+
+if(WITH_TESTING)
+  train_test(recognize_digits ARGS mlp conv)
+endif()
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <time.h>
+#include <fstream>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/place.h"
+
+DEFINE_string(dirname, "", "Directory of the train model.");
+
+namespace paddle {
+
+void Train() {
+  CHECK(!FLAGS_dirname.empty());
+  framework::InitDevices(false);
+  const auto cpu_place = platform::CPUPlace();
+  framework::Executor executor(cpu_place);
+  framework::Scope scope;
+
+  auto train_program = inference::Load(
+      &executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
+      FLAGS_dirname + "__params_combined__");
+
+  std::string loss_name = "";
+  for (auto op_desc : train_program->Block(0).AllOps()) {
+    if (op_desc->Type() == "mean") {
+      loss_name = op_desc->Output("Out")[0];
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+
+  // prepare data
+  auto x_var = scope.Var("img");
+  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize({64, 1, 28, 28});
+
+  auto x_data = x_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 64 * 28 * 28; ++i) {
+    x_data[i] = 1.0;
+  }
+
+  auto y_var = scope.Var("label");
+  auto y_tensor = y_var->GetMutable<framework::LoDTensor>();
+  y_tensor->Resize({64, 1});
+  auto y_data = y_tensor->mutable_data<int64_t>(cpu_place);
+  for (int i = 0; i < 64 * 1; ++i) {
+    y_data[i] = static_cast<int64_t>(1);
+  }
+
+  auto loss_var = scope.Var(loss_name);
+  float first_loss = 0.0;
+  float last_loss = 0.0;
+  for (int i = 0; i < 100; ++i) {
+    executor.Run(*train_program.get(), &scope, 0, false, true);
+    if (i == 0) {
+      first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
+    } else if (i == 99) {
+      last_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
+    }
+  }
+  EXPECT_LT(last_loss, first_loss);
+}
+
+TEST(train, recognize_digits) { Train(); }
+
+}  // namespace paddle
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -70,8 +70,8 @@ function cmake_gen() {
    PYTHON_FLAGS=""
    SYSTEM=`uname -s`
    if [ "$SYSTEM" == "Darwin" ]; then
+        echo "Using python abi: $1"
        if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
-            echo "using python abi: $1"
            if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
@@ -82,7 +82,18 @@ function cmake_gen() {
            else
                exit 1
            fi
-        # TODO: qiyang add python3 part here 
+        elif [ "$1" == "cp35-cp35m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
        fi
    else 
        if [ "$1" != "" ]; then
@@ -381,7 +392,7 @@ function run_mac_test() {
 EOF

        # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j8     
+        ctest --output-on-failure -j $1     
        # make install should also be test when unittest 
        make install -j 8
        pip install /usr/local/opt/paddle/share/wheels/*.whl
@@ -629,10 +640,10 @@ EOF

 function gen_capi_package() {
    if [[ ${WITH_C_API} == "ON" ]]; then
-        install_prefix="${PADDLE_ROOT}/build/capi_output"
-        rm -rf $install_prefix
-        make DESTDIR="$install_prefix" install
-        cd $install_prefix/usr/local
+        capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
+        rm -rf $capi_install_prefix
+        make DESTDIR="$capi_install_prefix" install
+        cd $capi_install_prefix/
        ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
    fi
 }
@@ -729,7 +740,11 @@ function main() {
      maccheck)
        cmake_gen ${PYTHON_ABI:-""}
        build_mac
-        run_mac_test
+        run_mac_test ${PROC_RUN:-1}
+        ;;
+      macbuild)
+        cmake_gen ${PYTHON_ABI:-""}
+        build_mac
        ;;
      cicheck_py35)
        cmake_gen ${PYTHON_ABI:-""}

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -87,6 +87,7 @@ if (WITH_TESTING)
    endif()
  endif()
  add_subdirectory(paddle/fluid/tests)
+  add_subdirectory(paddle/fluid/contrib/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
    DESTINATION opt/paddle/share/wheels

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
-            print("file md5", md5file(filename), md5sum)
+            sys.stderr.write("file %s  md5 %s" % (md5file(filename), md5sum))
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
-        print("Cache file %s not found, downloading %s" % (filename, url))
+        sys.stderr.write("Cache file %s not found, downloading %s" %
+                         (filename, url))
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                    sys.stderr.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()
-
+    sys.stderr.write("\n")
+    sys.stdout.flush()
    return filename



--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size):
            ]
            for name in names:
                for line in f.extractfile(name):
-                    line_split = line.strip().split(six.b('\t'))
+                    line = cpt.to_text(line)
+                    line_split = line.strip().split('\t')
                    if len(line_split) != 2:
                        continue
                    src_seq = line_split[0]  # one source sequence

--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
    word_dict = defaultdict(int)
    with tarfile.open(tar_file, mode="r") as f:
        for line in f.extractfile("wmt16/train"):
-            line_split = line.strip().split(six.b("\t"))
+            line = cpt.to_text(line)
+            line_split = line.strip().split("\t")
            if len(line_split) != 2: continue
            sen = line_split[0] if lang == "en" else line_split[1]
            for w in sen.split():
@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):

        with tarfile.open(tar_file, mode="r") as f:
            for line in f.extractfile(file_name):
-                line_split = line.strip().split(six.b("\t"))
+                line = cpt.to_text(line)
+                line_split = line.strip().split("\t")
                if len(line_split) != 2:
                    continue
                src_words = line_split[src_col].split()

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
        op_desc = op_descs[i]
        if isinstance(op_desc, tuple):
            op_desc = op_desc[0]
-        op_desc.rename_input(old_name, new_name)
-        op_desc.rename_output(old_name, new_name)
+        op_desc._rename_input(old_name, new_name)
+        op_desc._rename_output(old_name, new_name)


 def _create_op_desc_(op_type, inputs, outputs, attrs):
@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
        if isinstance(val, framework.Block):
            op_desc.set_block_attr(name, val.desc)
        else:
-            op_desc.set_attr(name, val)
+            op_desc._set_attr(name, val)
    return op_desc


@@ -346,7 +346,7 @@ def _append_backward_ops_(block,
        grad_sub_block_list = []
        # If the op has its own sub-block, deal with the sub-block first
        if op.has_attr("sub_block"):
-            sub_block = program.block(op.block_attr_id("sub_block"))
+            sub_block = program.block(op._block_attr_id("sub_block"))
            grad_sub_block = program._create_block()
            grad_sub_block._set_forward_block_idx(sub_block.idx)
            cb = _callback_lookup_(op)
@@ -382,7 +382,7 @@ def _append_backward_ops_(block,
    for op_desc in grad_op_descs:
        new_op_desc = target_block.desc.append_op()
        new_op_desc.copy_from(op_desc)
-        new_op_desc.set_attr(op_role_attr_name, backward)
+        new_op_desc._set_attr(op_role_attr_name, backward)
        grad_to_var["__current_op_desc__"] = new_op_desc
        if callbacks is not None:
            assert (isinstance(callbacks, list))
@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
    for op_idx in range(start_op_idx, block.desc.op_size()):
        op_desc = block.desc.op(op_idx)
        if op_desc.has_attr("sub_block"):
-            sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
+            sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
        new_vars = set()
        # create new gradient variables
@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
        op_desc = block.desc.op(op_idx)
        for name in op_desc.input_arg_names():
            if name in var_map:
-                op_desc.rename_input(name, var_map[name])
+                op_desc._rename_input(name, var_map[name])

        for name in op_desc.output_arg_names():
            if block.desc.find_var(name.encode("ascii")):
                new_name = unique_name.generate(name)
-                op_desc.rename_output(name, new_name)
+                op_desc._rename_output(name, new_name)
                var_map[name] = new_name

    for g, ng in six.iteritems(var_map):
@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        if loss.op is None:
            raise ValueError("loss.op is None. Should not happend")

-    loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
-                     int(core.op_proto_and_checker_maker.OpRole.Forward) |
-                     int(core.op_proto_and_checker_maker.OpRole.Loss))
+    loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
+                      int(core.op_proto_and_checker_maker.OpRole.Forward) |
+                      int(core.op_proto_and_checker_maker.OpRole.Loss))

    if callbacks is not None:
        isinstance(callbacks, list)
@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        attr_val = [p.name, g.name]
        if g.op.has_attr(op_role_var_attr_name):
            attr_val.extend(g.op.attr(op_role_var_attr_name))
-        g.op.set_attr(op_role_var_attr_name, attr_val)
+        g.op._set_attr(op_role_var_attr_name, attr_val)

    return params_and_grads


--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr):
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
-        clip_op_desc.set_attr("min", self.min)
-        clip_op_desc.set_attr("max", self.max)
+        clip_op_desc._set_attr("min", self.min)
+        clip_op_desc._set_attr("max", self.max)


 def error_clip_callback(block, context):

--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -18,5 +18,13 @@ from . import decoder
 from .decoder import *
 from . import memory_usage_calc
 from .memory_usage_calc import *
+from . import op_frequence
+from .op_frequence import *
+from . import quantize
+from .quantize import *

-__all__ = decoder.__all__ + memory_usage_calc.__all__
+__all__ = []
+__all__ += decoder.__all__
+__all__ += memory_usage_calc.__all__
+__all__ += op_frequence.__all__
+__all__ += quantize.__all__
--- a/python/paddle/fluid/contrib/op_frequence.py
+++ b/python/paddle/fluid/contrib/op_frequence.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from collections import OrderedDict
+
+from ..framework import Program
+
+__all__ = ['op_freq_statistic']
+
+
+def op_freq_statistic(program):
+    """
+    Statistics of Op frequency.
+
+    Args:
+        program(Program): The current Program.
+
+    Returns:
+        uni_op_freq(dict): the single op frequency.
+        adj_2_op_freq(dict): the two adjacent ops frequency.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
+        >>>        fluid.default_main_program())
+        >>> for op_type, op_num in uni_op_freq:
+        >>>     print("%s  \t  %d" % (op_type, op_num))
+        >>> for op_type, op_num in adj_2_op_freq:
+        >>>     print("%s  \t  %d" % (op_type, op_num))
+
+    """
+
+    if not isinstance(program, Program):
+        raise TypeError("The input type should be Porgram."
+                        "But you passed in %s" % (type(program)))
+
+    uni_op_freq = OrderedDict()
+    adj_2_op_freq = OrderedDict()
+    op_in_ops = OrderedDict()
+
+    parameters = [p.name for p in program.blocks[0].all_parameters()]
+
+    # get uni_op_freq
+    for op in program.global_block().ops:
+        had_recorded = False
+        for var_name in op.output_arg_names:
+            if var_name in parameters:
+                continue
+            if not had_recorded and uni_op_freq.has_key(op.type):
+                uni_op_freq[op.type] += 1
+                had_recorded = True
+            elif not had_recorded:
+                uni_op_freq[op.type] = 1
+                had_recorded = True
+
+    # get adj_2_op_freq
+    var_gen_op = {}
+    for op in program.global_block().ops:
+        for var_name in op.input_arg_names:
+            if var_name in parameters:
+                continue
+            if var_gen_op.has_key(var_name):
+                assert len(var_gen_op[var_name]) > 0
+                if op_in_ops.has_key(op.type):
+                    op_in_ops[op.type].append(var_gen_op[var_name][-1])
+                else:
+                    op_in_ops[op.type] = [var_gen_op[var_name][-1]]
+            else:
+                print("Var's generate op is not found,%s, %s" %
+                      (var_name, op.type))
+
+        for var_name in op.output_arg_names:
+            if var_gen_op.has_key(var_name):
+                var_gen_op[var_name].append(op.type)
+            else:
+                var_gen_op[var_name] = [op.type]
+
+    for op, in_ops in op_in_ops.iteritems():
+        for in_op in in_ops:
+            op_op = in_op + "->" + op
+            if adj_2_op_freq.has_key(op_op):
+                adj_2_op_freq[op_op] += 1
+            else:
+                adj_2_op_freq[op_op] = 1
+
+    uni_op_freq = sorted(
+        uni_op_freq.items(), key=lambda item: item[1], reverse=True)
+    adj_2_op_freq = sorted(
+        adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
+
+    return uni_op_freq, adj_2_op_freq
--- a/python/paddle/fluid/contrib/quantize/__init__.py
+++ b/python/paddle/fluid/contrib/quantize/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import quantize_transpiler
+from .quantize_transpiler import *
+
+__all__ = quantize_transpiler.__all__
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import numpy as np
+
+from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid import unique_name
+from paddle.fluid import core
+from paddle.fluid.initializer import Constant
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.layers.nn import autoincreased_step_counter
+from paddle.fluid.framework import Variable
+from paddle.fluid.executor import global_scope
+from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
+
+__all__ = ['QuantizeTranspiler']
+
+_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
+
+
+def _quantized_var_name(var_name):
+    """
+    Return quantized variable name for the input `var_name`.
+    """
+    return "%s.quantized" % (var_name)
+
+
+def _dequantized_var_name(var_name):
+    """
+    Return dequantized variable name for the input `var_name`.
+    """
+    return "%s.dequantized" % (var_name)
+
+
+def _quantized_scale_name(var_name):
+    """
+    Return quantized variable name for the input `var_name`.
+    """
+    return "%s.scale" % (var_name)
+
+
+def _original_var_name(var_name):
+    """
+    Return the original variable name.
+    """
+    if var_name.endswith('.quantized.dequantized'):
+        return var_name[:-len('.quantized.dequantized')]
+    if var_name.endswith('.quantized'):
+        return var_name[:-len('.quantized')]
+    if var_name.endswith('.dequantized'):
+        return var_name[:-len('.dequantized')]
+    if var_name.endswith('.scale'):
+        return var_name[:-len('.scale')]
+    else:
+        return var_name
+
+
+def _is_float(v):
+    return isinstance(v, float) or isinstance(v, np.float32)
+
+
+def quant(x, scale, num_bits):
+    y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+    return y
+
+
+class QuantizeTranspiler(object):
+    def __init__(self,
+                 weight_bits=8,
+                 activation_bits=8,
+                 activation_quantize_type='abs_max',
+                 weight_quantize_type='abs_max',
+                 window_size=10000):
+        """
+        Convert and rewrite the fluid Program according to weight and
+        activation quantization type.
+
+        Args:
+            weight_bits (int): quantization bit number for weights,
+                the bias is not quantized.
+            activation_bits (int): quantization bit number for activation.
+            activation_quantize_type (str): quantization type for activation,
+                now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
+                the quantization scale will be calculated dynamically each step
+                in both training and testing period. If use 'range_abs_max',
+                a static quantization scale will be calculated during training
+                and used in inference.
+            weight_quantize_type (str): quantization type for weights,
+                support 'abs_max'. The 'range_abs_max' usually is not used for
+                weight, since weights are fixed once the model is well trained.
+            window_size (int): the window size for 'range_abs_max' quantization.
+
+        Examples:
+
+        .. code-block:: python
+
+            # the original program will be rewrite, if you don't want to
+            # change it, please clone at first.
+            # quantize_program = program.clone()
+            t = fluid.QuantizeTranspiler()
+            t.transpile(quantize_program)
+
+        """
+        self.weight_bits = weight_bits
+        self.activation_bits = activation_bits
+        quant_type = ['abs_max', 'range_abs_max']
+        if weight_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown weight_quantize_type: '%s'. It can only be ",
+                "'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
+        if activation_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown activation_quantize_type : '%s'. It can only be ",
+                "'abs_max' or 'range_abs_max'.", str(activation_quantize_type))
+
+        self.weight_quantize_type = weight_quantize_type
+        self.activation_quantize_type = activation_quantize_type
+
+        self.window_size = window_size
+        self.helper = LayerHelper(self.__class__.__name__)
+        self.fake_quant_op_types = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
+        self.is_test = None
+        self.global_step = None
+
+    def training_transpile(self, program=None, startup_program=None):
+        """Rewrites a training input program in place for simulated
+        quantization. Insert fake quantization and de-quantization ops into
+        program to simulate the error introduced by quantization. And change
+        the graident ops' input by using the faked quantization weights and
+        activation. Since the program is transformed in place, the graph
+        connection will change.
+
+        Args:
+            program (Program): the input program to be transpile.
+        """
+        self.is_test = False
+        program = default_main_program() if program is None else program
+        startup_program = default_startup_program() if startup_program is \
+            None else startup_program
+
+        # marked the variable which has been quantized and dequantized.
+        dequanted_vars = [
+            collections.OrderedDict() for _ in range(len(program.blocks))
+        ]
+        grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES]
+
+        params = [p.name for p in program.global_block().iter_parameters()]
+
+        def _transpile_forward(block, op):
+            idx = block.ops.index(op)
+            block_id = block.idx
+            # insert quant op and dequant op
+            for name in op.input_arg_names:
+                if name in dequanted_vars[block_id]:
+                    dequant_var = dequanted_vars[block_id][name]
+                else:
+                    var = block.var(name)
+                    quant_bits = self.weight_bits if var.name in params \
+                                 else self.activation_bits
+                    quant_type = self.weight_quantize_type if var.name \
+                        in params else self.activation_quantize_type
+
+                    quant_var, scale_var = self._insert_quant_op(
+                        block, idx, var, quant_bits, quant_type)
+                    dequant_var = self._insert_dequant_op(
+                        block, idx + 1, quant_var, scale_var, quant_bits)
+                    dequanted_vars[block_id][name] = dequant_var
+                # rename the forward op inputs
+                op._rename_input(name, dequant_var.name)
+
+        def _transpile_backward(block, op):
+            block_id = block.idx
+            no_dequanted_input_vars = True
+            for name in op.input_arg_names:
+                if name in dequanted_vars[block_id]:
+                    dequant_var = dequanted_vars[block_id][name]
+                    op._rename_input(name, dequant_var.name)
+                    no_dequanted_input_vars = False
+            if no_dequanted_input_vars:
+                raise ValueError("There is no dequanted inputs for op %s." %
+                                 (op.type))
+
+        with program_guard(program, startup_program):
+            self._create_global_step()
+            for block in program.blocks:
+                ops = list(block.ops)
+                block_id = block.idx
+                for op in ops:
+                    # rewrite the forward ProgramDes
+                    if op.type in _QUANTIZABLE_OP_TYPES:
+                        _transpile_forward(block, op)
+                    # rename the backward op inputs
+                    if op.type in grad_op_types:
+                        _transpile_backward(block, op)
+
+    def _create_global_step(self):
+        if self.weight_quantize_type == 'range_abs_max' or \
+            self.activation_quantize_type == 'range_abs_max':
+            self.global_step = autoincreased_step_counter()
+
+    def freeze_program(self, program, place, fuse_bn=False, scope=None):
+        """Freeze input training program for inference.
+
+        Args:
+            program (Program): the input program to be transpile.
+        """
+
+        self.is_test = True
+        scope = global_scope() if scope is None else scope
+        program = default_main_program() if program is None else program
+
+        if fuse_bn:
+            bn_fuse_transpiler = BNFuseTranspiler()
+            bn_fuse_transpiler.transpile(program, place)
+
+        persistable_vars = [
+            v.name
+            for v in filter(lambda var: var.persistable, program.list_vars())
+        ]
+        op_in_rename_map = [
+            collections.OrderedDict() for _ in range(len(program.blocks))
+        ]
+        op_out_rename_map = [
+            collections.OrderedDict() for _ in range(len(program.blocks))
+        ]
+        var_scale_map = [
+            collections.OrderedDict() for _ in range(len(program.blocks))
+        ]
+
+        def _remove_fake_quant_and_dequant_op(block, op):
+            idx = block.ops.index(op)
+            block_id = block.idx
+            k = op.output('Out')[0]
+            v = op.input('X')[0]
+            if v not in op_in_rename_map[block_id]:
+                op_in_rename_map[block_id][k] = v
+            else:
+                op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v]
+            block._remove_op(idx)
+
+        def _insert_post_dequant_op(block, op):
+            idx = block.ops.index(op)
+            block_id = block.idx
+            max_range = None
+            scale_var = None
+            for name in op.input_arg_names:
+                if name in op_in_rename_map[block_id]:
+                    op._rename_input(name, op_in_rename_map[block_id][name])
+
+                scale_v = var_scale_map[block_id][_original_var_name(name)]
+                if _original_var_name(name) in persistable_vars:
+                    param_range = (1 << (self.weight_bits - 1)) - 1
+                    act_range = (1 << (self.activation_bits - 1)) - 1
+                    assert _is_float(scale_v)
+                    max_range = param_range * act_range / scale_v
+                else:
+                    assert isinstance(scale_v, Variable)
+                    scale_var = var_scale_map[block_id][_original_var_name(
+                        name)]
+
+            if len(op.output_arg_names) != 1:
+                raise ValueError("Only support one output, but op %s has"
+                                 " more than one output." % (op.type))
+            out_var = block.var(op.output_arg_names[0])
+            dequant_var = block.create_var(
+                name=_dequantized_var_name(out_var.name),
+                type=out_var.type,
+                shape=out_var.shape,
+                dtype=out_var.dtype)
+            # insert fake_dequantize_op
+            dequant_op = block._insert_op(
+                idx + 1,
+                type="fake_dequantize_max_abs",
+                attrs={'max_range': float(max_range)},
+                inputs={"X": out_var,
+                        'Scale': scale_var},
+                outputs={"Out": dequant_var})
+            op_out_rename_map[block_id][out_var.name] = dequant_var.name
+            return dequant_var
+
+        def _load_var(name):
+            return np.array(scope.find_var(name).get_tensor())
+
+        def _restore_var(name, arr):
+            t = scope.find_var(name).get_tensor()
+            t.set(arr, place)
+
+        for block in program.blocks:
+            ops = list(block.ops)
+            block_id = block.idx
+            for op in ops:
+                op_type = op.type
+
+                # insert dequant_op after fc/conv, need to rename
+                # input of the followed ops
+                for name in op.input_arg_names:
+                    if name in op_out_rename_map[block_id]:
+                        op._rename_input(name,
+                                         op_out_rename_map[block_id][name])
+
+                if op_type in self.fake_quant_op_types:
+                    in_arg_name = op.input('X')[0]
+                    if in_arg_name in persistable_vars:
+                        if self.weight_quantize_type == 'abs_max':
+                            param = _load_var(in_arg_name)
+                            scale_v = np.max(np.abs(param))
+                        else:
+                            scale_v = _load_var(op.output('OutScale')[0])
+                        var_scale_map[block_id][in_arg_name] = scale_v
+                    else:
+                        scale_v = block.var(op.output('OutScale')[0])
+                        var_scale_map[block_id][in_arg_name] = scale_v
+
+                    if in_arg_name in persistable_vars:
+                        _remove_fake_quant_and_dequant_op(block, op)
+                        # quantize weight and restore
+                        param_t = _load_var(in_arg_name)
+                        param_q_t = quant(param_t, scale_v, self.weight_bits)
+                        _restore_var(in_arg_name, param_q_t)
+
+                if op_type in self.fake_dequant_op_types:
+                    _remove_fake_quant_and_dequant_op(block, op)
+
+                if op_type in _QUANTIZABLE_OP_TYPES:
+                    dequant_var = _insert_post_dequant_op(block, op)
+
+        # remove the unused var in ProgramDesc
+        self._remove_unused_var(program)
+        #program = program.clone()
+
+    def convert_to_int8(self, program, place, scope=None):
+        scope = global_scope() if scope is None else scope
+        program = default_main_program() if program is None else program
+
+        def _load_var(name):
+            return np.array(scope.find_var(name).get_tensor())
+
+        global_block = program.global_block()
+
+        def convert_to_int8(var):
+            int8_var_name = var.name + ".int8"
+            int8_var = global_block.create_parameter(
+                name=int8_var_name.encode('ascii'),
+                type=var.type,
+                dtype=core.VarDesc.VarType.INT8,
+                shape=var.shape)
+
+            tensor = _load_var(var.name)
+
+            scope.var(int8_var_name)
+            int8_tensor = scope.find_var(int8_var_name).get_tensor()
+            int8_tensor.set(tensor.astype(np.int8), place)
+            return int8_var
+
+        input_map = {}
+        for block in program.blocks:
+            for op in list(block.ops):
+                if op.type in _QUANTIZABLE_OP_TYPES:
+                    for name in op.input_arg_names:
+                        var = block.var(name)
+                        if var.persistable:
+                            if name not in input_map:
+                                int8_var = convert_to_int8(var)
+                                input_map[name] = int8_var.name
+                            op._rename_input(name, input_map[name])
+        self._remove_unused_var(program)
+
+    def _remove_unused_var(self, program):
+        all_remove_vars = []
+        for block in program.blocks:
+            args = []
+            for op in block.ops:
+                args += op.input_arg_names
+                args += op.output_arg_names
+            args = list(set(args))
+            var_names = block.vars.keys()
+            sub_block_remove_vars = []
+            for var in var_names:
+                if var not in args:
+                    sub_block_remove_vars.append(var)
+            all_remove_vars.append(sub_block_remove_vars)
+
+        remove_vars = [list(set(v)) for v in all_remove_vars]
+        for i, block in enumerate(program.blocks):
+            for v in remove_vars[i]:
+                block._remove_var(v)
+
+    def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
+        """Insert fake_quantize_abs_max op.
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        scale = block.create_var(
+            name=_quantized_scale_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_abs_max',
+            attrs={'bit_length': quant_bits},
+            inputs={'X': var},
+            outputs={'Out': quant_var,
+                     'OutScale': scale})
+        return quant_var, scale
+
+    def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
+        """Insert fake_quantize_range_abs_max
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        scale = self.helper.create_parameter(
+            attr=ParamAttr(
+                name=_quantized_scale_name(var.name),
+                initializer=Constant(0.001),
+                trainable=False),
+            shape=[1],
+            dtype=var.dtype)
+        scale.stop_gradient = True
+
+        ins = {'X': var, 'InScale': scale}
+        outs = {'Out': quant_var, 'OutScale': scale}
+        if not self.is_test:
+            # A global step counter variable with type int64
+            scales = self.helper.create_global_variable(
+                name=unique_name.generate('scales'),
+                persistable=True,
+                dtype=var.dtype,
+                shape=[self.window_size])
+            self.helper.set_variable_initializer(
+                scales, initializer=Constant(value=0))
+
+            ins['Iter'] = self.global_step
+            outs['OutScales'] = scales
+
+        attrs = {
+            'window_size': self.window_size,
+            'bit_length': quant_bits,
+            'is_test': self.is_test
+        }
+
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_range_abs_max',
+            attrs=attrs,
+            inputs=ins,
+            outputs=outs)
+
+        return quant_var, scale
+
+    def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
+        """
+        Insert fake_quantize_op
+        """
+        if quant_type == 'abs_max':
+            return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
+        elif quant_type == 'range_abs_max':
+            return self._insert_quant_range_abs_max_op(block, idx, var,
+                                                       quant_bits)
+
+    def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
+        """
+        Insert fake_quantize_op
+        """
+        dequant_var = block.create_var(
+            name=_dequantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        # insert fake_dequantize_op
+        max_range = (1 << (quant_bits - 1)) - 1
+        dequant_op = block._insert_op(
+            idx,
+            type="fake_dequantize_max_abs",
+            attrs={'max_range': float(max_range)},
+            inputs={"X": var,
+                    'Scale': scale},
+            outputs={"Out": dequant_var})
+        return dequant_var
+
+
+class BNFuseTranspiler(InferenceTranspiler):
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        def _update_param(op, param_name, new_param):
+            var = self.block.vars[param_name]
+            tensor = self.scope.find_var(param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias")[0])  #Bias
+        scale_bn = _load_param(bn_op.input("Scale")[0])  #Scale
+        mean_bn = _load_param(bn_op.input("Mean")[0])  #Mean
+        var_bn = _load_param(bn_op.input("Variance")[0])  #Variance
+
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            current_param = _load_param(
+                _original_var_name(current_op.input("Filter")[0]))
+        elif current_op.type == 'mul':
+            current_param = _load_param(
+                _original_var_name(current_op.input("Y")[0]))
+
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y"))
+        else:
+            bias = np.zeros(bias_bn.shape)
+        bias = np.float32(
+            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
+
+        # re-compute weight of conv2d/fc
+        tmp = tmp.reshape(tmp.shape[0], -1)
+        dst_param = current_param.reshape((tmp.shape[0], -1))
+        dst_param = np.float32(np.multiply(dst_param, tmp))
+        dst_param = dst_param.reshape(current_param.shape)
+
+        # update parameters
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            _update_param(current_op,
+                          _original_var_name(current_op.input("Filter")[0]),
+                          dst_param)
+        elif current_op.type == 'mul':
+            _update_param(current_op,
+                          _original_var_name(current_op.input("Y")[0]),
+                          dst_param)
+
+        _update_param(bias_op, bias_op.input("Y")[0], bias)
+
+        # collect the renamed input
+        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import numpy as np
+import six
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
+from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler
+
+
+def linear_fc(num):
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        hidden = fluid.layers.fc(hidden, size=128, act='relu')
+    loss = fluid.layers.cross_entropy(input=hidden, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return avg_loss
+
+
+class TestQuantizeTranspiler(unittest.TestCase):
+    def setUp(self):
+        # since quant_op and dequant_op is not ready, use cos and sin for test
+        self.weight_quant_op_type = 'fake_quantize_abs_max'
+        self.dequant_op_type = 'fake_dequantize_max_abs'
+        self.quantizable_op_and_inputs = {
+            'conv2d': ['Input', 'Filter'],
+            'depthwise_conv2d': ['Input', 'Filter'],
+            'mul': ['X', 'Y']
+        }
+        self.quantizable_op_grad_and_inputs = {
+            'conv2d_grad': ['Input', 'Filter'],
+            'depthwise_conv2d_grad': ['Input', 'Filter'],
+            'mul_grad': ['X', 'Y']
+        }
+
+    def check_program(self, program):
+        quantized_ops = {}
+
+        persistable_vars = [
+            v.name
+            for v in filter(lambda var: var.persistable, program.list_vars())
+        ]
+
+        for block in program.blocks:
+            for idx, op in enumerate(block.ops):
+                # check forward
+                if op.type in self.quantizable_op_and_inputs:
+                    for i, arg_name in enumerate(op.input_arg_names):
+                        quant_op_type = self.weight_quant_op_type if \
+                            _original_var_name(arg_name) \
+                            in persistable_vars else self.act_quant_op_type
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        if arg_name not in quantized_ops:
+                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
+                                             self.dequant_op_type)
+                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
+                                             quant_op_type)
+                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
+                        else:
+                            op_idx = block.ops.index(quantized_ops[arg_name])
+                            self.assertLess(op_idx, idx)
+
+                # check backward
+                if op.type in self.quantizable_op_grad_and_inputs:
+                    for pname in self.quantizable_op_grad_and_inputs[op.type]:
+                        arg_name = op.input(pname)[0]
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        self.assertTrue(arg_name in quantized_ops)
+
+    def linear_fc_quant(self, quant_type):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = linear_fc(3)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+            t = QuantizeTranspiler(activation_quantize_type=quant_type)
+            t.training_transpile(main)
+            self.check_program(main)
+
+    def test_linear_fc_quant_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_abs_max'
+        self.linear_fc_quant('abs_max')
+
+    def test_linear_fc_quant_range_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_range_abs_max'
+        self.linear_fc_quant('range_abs_max')
+
+    def residual_block_quant(self, quant_type):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = residual_block(2)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+            t = QuantizeTranspiler(activation_quantize_type=quant_type)
+            t.training_transpile(main)
+            self.check_program(main)
+
+    def test_residual_block_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_abs_max'
+        self.residual_block_quant('abs_max')
+
+    def test_residual_block_range_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_range_abs_max'
+        self.residual_block_quant('range_abs_max')
+
+    def freeze_program(self, use_cuda):
+        def build_program(main, startup, is_test):
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    img = fluid.layers.data(
+                        name='image', shape=[1, 28, 28], dtype='float32')
+                    label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+                    loss = conv_net(img, label)
+                    if not is_test:
+                        opt = fluid.optimizer.Adam(learning_rate=0.001)
+                        opt.minimize(loss)
+            return [img, label], loss
+
+        main = fluid.Program()
+        startup = fluid.Program()
+        test_program = fluid.Program()
+
+        feeds, loss = build_program(main, startup, False)
+        build_program(test_program, startup, True)
+        test_program = test_program.clone(for_test=True)
+
+        quant_transpiler = QuantizeTranspiler()
+        quant_transpiler.training_transpile(main)
+        quant_transpiler.training_transpile(test_program)
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        iter = 5
+        batch_size = 8
+        class_num = 10
+        exe.run(startup)
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=500),
+            batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
+
+        with fluid.program_guard(main):
+            for _ in range(iter):
+                data = next(train_reader())
+                loss_v = exe.run(program=main,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+
+        with fluid.program_guard(test_program):
+            test_data = next(test_reader())
+            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
+                                             test_program)
+            # Testing during training
+            test_loss1, w_quant = exe.run(program=test_program,
+                                          feed=feeder.feed(test_data),
+                                          fetch_list=[loss, w_var])
+
+            # Freeze program for inference, but the weight of fc/conv is still float type.
+            quant_transpiler.freeze_program(test_program, place)
+            test_loss2, = exe.run(program=test_program,
+                                  feed=feeder.feed(test_data),
+                                  fetch_list=[loss])
+            self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3)
+            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
+                                .get_tensor())
+            self.assertEqual(np.sum(w_freeze), np.sum(w_quant))
+
+            # Convert parameter to 8-bit.
+            quant_transpiler.convert_to_int8(test_program, place)
+            # Save the 8-bit parameter and model file.
+            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
+                                          [loss], exe, test_program)
+            # Test whether the 8-bit parameter and model file can be loaded successfully.
+            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
+                                                                 exe)
+            # Check the loaded 8-bit weight.
+            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
+                              .get_tensor())
+
+            self.assertEqual(w_8bit.dtype, np.int8)
+            self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+
+    def test_freeze_program_cuda(self):
+        if fluid.core.is_compiled_with_cuda():
+            with fluid.unique_name.guard():
+                self.freeze_program(True)
+
+    def test_freeze_program_cpu(self):
+        with fluid.unique_name.guard():
+            self.freeze_program(False)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None

 __all__ = [
    'Program',
-    'Operator',
    'default_startup_program',
    'default_main_program',
    'program_guard',
-    'get_var',
    'name_scope',
 ]

@@ -663,11 +661,11 @@ class Operator(object):
                self._update_desc_attr(attr_name, attr_val)

        self.desc.check_attrs()
-        if self.has_kernel(type):
+        if self._has_kernel(type):
            self.desc.infer_var_type(self.block.desc)
            self.desc.infer_shape(self.block.desc)

-    def has_kernel(self, op_type):
+    def _has_kernel(self, op_type):
        return op_type not in self.OP_WITHOUT_KERNEL_SET

    def to_string(self, throw_on_error):
@@ -708,7 +706,7 @@ class Operator(object):
        """
        return self.desc.input(name)

-    def rename_input(self, old_name, new_name):
+    def _rename_input(self, old_name, new_name):
        """
        Rename the `old_name` to `new_name`.

@@ -719,9 +717,9 @@ class Operator(object):
        Returns:
            None
        """
-        self.desc.rename_input(old_name, new_name)
+        self.desc._rename_input(old_name, new_name)

-    def rename_output(self, old_name, new_name):
+    def _rename_output(self, old_name, new_name):
        """
        Rename the `old_name` to `new_name`.

@@ -732,7 +730,7 @@ class Operator(object):
        Returns:
            None
        """
-        self.desc.rename_output(old_name, new_name)
+        self.desc._rename_output(old_name, new_name)

    @property
    def input_names(self):
@@ -796,7 +794,7 @@ class Operator(object):
        """
        return self.desc.attr_type(name)

-    def set_attr(self, name, val):
+    def _set_attr(self, name, val):
        """
        Set the value of attribute by attribute's name.

@@ -829,7 +827,7 @@ class Operator(object):
                isinstance(val, core.ProgramDesc):
            self.desc.set_serialized_attr(name, val.serialize_to_string())
        else:
-            self.desc.set_attr(name, val)
+            self.desc._set_attr(name, val)

    @property
    def attr_names(self):
@@ -848,7 +846,7 @@ class Operator(object):
        """
        return self.desc.attr(name)

-    def block_attr_id(self, name):
+    def _block_attr_id(self, name):
        """
        Get the block attribute's id by name.

@@ -858,9 +856,9 @@ class Operator(object):
        Returns:
            int: the block index.
        """
-        return self.desc.block_attr_id(name)
+        return self.desc._block_attr_id(name)

-    def block_attr(self, name):
+    def _block_attr(self, name):
        """
        Get the block attribute  by name.

@@ -871,11 +869,11 @@ class Operator(object):
            block: the block attribute.
        """

-        id = self.block_attr_id(name)
+        id = self._block_attr_id(name)
        assert (id >= 0 and id < len(self.block.program.blocks))
        return self.block.program.blocks[id]

-    def blocks_attr(self, name):
+    def _blocks_attr(self, name):
        """
        Get the blocks attribute  by name.

@@ -886,13 +884,13 @@ class Operator(object):
            list: list of the blocks attribute.
        """
        attrs = []
-        for i in self.blocks_attr_ids(name):
+        for i in self._blocks_attr_ids(name):
            assert (i >= 0 and i < len(self.block.program.blocks))
            attrs.append(self.block.program.blocks[i])

        return attrs

-    def blocks_attr_ids(self, name):
+    def _blocks_attr_ids(self, name):
        """
        Get the blocks attribute's ids by name.

@@ -903,7 +901,7 @@ class Operator(object):
            list: list of the blocks ids.
        """

-        return self.desc.blocks_attr_ids(name)
+        return self.desc._blocks_attr_ids(name)

    def all_attrs(self):
        """
@@ -917,11 +915,11 @@ class Operator(object):
        for n in attr_names:
            attr_type = self.desc.attr_type(n)
            if attr_type == core.AttrType.BLOCK:
-                attr_map[n] = self.block_attr(n)
+                attr_map[n] = self._block_attr(n)
                continue

            if attr_type == core.AttrType.BLOCKS:
-                attr_map[n] = self.blocks_attr(n)
+                attr_map[n] = self._blocks_attr(n)
                continue

            attr_map[n] = self.attr(n)
@@ -1795,7 +1793,7 @@ class Program(object):
            for j in six.moves.range(block.op_size()):
                op = block.op(j)
                if op.has_attr('is_test'):
-                    op.set_attr('is_test', True)
+                    op._set_attr('is_test', True)
        res.blocks = [
            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
        ]
@@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None):
        switch_startup_program(startup_program)


-def get_var(name, program=None):
+def _get_var(name, program=None):
    """
    Get a variable by name from the global block of a program.


--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -600,7 +600,7 @@ def save_inference_model(dirname,
    """
    if isinstance(feeded_var_names, six.string_types):
        feeded_var_names = [feeded_var_names]
-    else:
+    elif export_for_deployment:
        if len(feeded_var_names) > 0:
            # TODO(paddle-dev): polish these code blocks
            if not (bool(feeded_var_names) and all(
@@ -610,61 +610,60 @@ def save_inference_model(dirname,

    if isinstance(target_vars, Variable):
        target_vars = [target_vars]
-    else:
+    elif export_for_deployment:
        if not (bool(target_vars) and all(
                isinstance(var, Variable) for var in target_vars)):
            raise ValueError("'target_vars' should be a list of Variable.")

    if main_program is None:
        main_program = default_main_program()
-    copy_program = main_program.clone()
+
+    # if there is lookup table, the trainer 0 will notify all pserver to save.
+    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        _save_lookup_tables_by_notify(executor, lookup_table_filename,
+                                      main_program._distributed_lookup_table,
+                                      main_program._endpoints)

    if not os.path.isdir(dirname):
        os.makedirs(dirname)
+    if model_filename is not None:
+        model_basename = os.path.basename(model_filename)
+    else:
+        model_basename = "__model__"
+    model_basename = os.path.join(dirname, model_basename)

    # When export_for_deployment is true, we modify the program online so that
    # it can only be loaded for inference directly. If it's false, the whole
    # original program and related meta are saved so that future usage can be
    # more flexible.
    if export_for_deployment:
-        global_block = copy_program.global_block()
+        main_program = main_program.clone()
+        global_block = main_program.global_block()
        for i, op in enumerate(global_block.ops):
            op.desc.set_is_target(False)
            if op.type == "feed" or op.type == "fetch":
                global_block._remove_op(i)
-        copy_program.desc.flush()
+        main_program.desc.flush()

-        pruned_program = copy_program._prune(targets=target_vars)
-        saved_program = pruned_program._inference_optimize(prune_read_op=True)
+        main_program = main_program._prune(targets=target_vars)
+        main_program = main_program._inference_optimize(prune_read_op=True)
        fetch_var_names = [v.name for v in target_vars]

-        prepend_feed_ops(saved_program, feeded_var_names)
-        append_fetch_ops(saved_program, fetch_var_names)
+        prepend_feed_ops(main_program, feeded_var_names)
+        append_fetch_ops(main_program, fetch_var_names)
+
+        with open(model_basename, "wb") as f:
+            f.write(main_program.desc.serialize_to_string())
    else:
        # TODO(panyx0718): Save more information so that it can also be used
        # for training and more flexible post-processing.
-        saved_program = copy_program
-
-    if model_filename is not None:
-        model_filename = os.path.basename(model_filename)
-    else:
-        model_filename = "__model__"
-    model_filename = os.path.join(dirname, model_filename)
+        with open(model_basename + ".main_program", "wb") as f:
+            f.write(main_program.desc.serialize_to_string())

    if params_filename is not None:
        params_filename = os.path.basename(params_filename)
-
-    with open(model_filename, "wb") as f:
-        f.write(saved_program.desc.serialize_to_string())
-
-    save_persistables(executor, dirname, saved_program, params_filename)
-
-    # if there is lookup table, the trainer 0 will notify all pserver to save.
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        _save_lookup_tables_by_notify(executor, lookup_table_filename,
-                                      main_program._distributed_lookup_table,
-                                      main_program._endpoints)
+    save_persistables(executor, dirname, main_program, params_filename)


 def load_inference_model(dirname,

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -284,7 +284,7 @@ def detection_output(loc,
        target_box=loc,
        code_type='decode_center_size')
    compile_shape = scores.shape
-    run_shape = ops.shape(scores)
+    run_shape = nn.shape(scores)
    scores = nn.flatten(x=scores, axis=2)
    scores = nn.softmax(input=scores)
    scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
@@ -697,7 +697,7 @@ def ssd_loss(location,
        raise ValueError("Only support mining_type == max_negative now.")

    num, num_prior, num_class = confidence.shape
-    conf_shape = ops.shape(confidence)
+    conf_shape = nn.shape(confidence)

    def __reshape_to_2d(var):
        return nn.flatten(x=var, axis=2)
@@ -724,7 +724,7 @@ def ssd_loss(location,
    target_label.stop_gradient = True
    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
    # 3. Mining hard examples
-    actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2])
+    actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
    actual_shape.stop_gradient = True
    conf_loss = nn.reshape(
        x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -507,7 +507,6 @@ def py_reader(capacity,

        1. The basic usage of :code:`py_reader` is as follows:

-        >>> import paddle.v2
        >>> import paddle.fluid as fluid
        >>> import paddle.dataset.mnist as mnist
        >>>
@@ -515,7 +514,7 @@ def py_reader(capacity,
        >>>                                 shapes=[(-1,3,224,224), (-1,1)],
        >>>                                 dtypes=['float32', 'int64'])
        >>> reader.decorate_paddle_reader(
-        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>     paddle.reader.shuffle(paddle.batch(mnist.train())
        >>>
        >>> img, label = fluid.layers.read_file(reader)
        >>> loss = network(img, label) # some network definition
@@ -534,7 +533,6 @@ def py_reader(capacity,
        2. When training and testing are both performed, two different
        :code:`py_reader` should be created with different names, e.g.:

-        >>> import paddle.v2
        >>> import paddle.fluid as fluid
        >>> import paddle.dataset.mnist as mnist
        >>>
@@ -548,7 +546,7 @@ def py_reader(capacity,
        >>>                                       dtypes=['float32', 'int64'],
        >>>                                       name='train_reader')
        >>> train_reader.decorate_paddle_reader(
-        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>     paddle.reader.shuffle(paddle.batch(mnist.train())
        >>>
        >>> test_reader = fluid.layers.py_reader(capacity=32,
        >>>                                      shapes=[(-1,3,224,224), (-1,1)],

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
    return acc_out


-def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
+def auc(input,
+        label,
+        curve='ROC',
+        num_thresholds=2**12 - 1,
+        topk=1,
+        slide_steps=1):
    """
    **Area Under the Curve (AUC) Layer**

@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
        num_thresholds(int): The number of thresholds to use when discretizing
                             the roc curve. Default 200.
        topk(int): only topk number of prediction output will be used for auc.
+        slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps.
+

    Returns:
        Variable: A scalar representing the current AUC.
@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
    auc_out = helper.create_tmp_variable(dtype="float64")
    batch_auc_out = helper.create_tmp_variable(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
+
+    # for batch auc
+    batch_stat_pos = helper.create_global_variable(
+        persistable=True,
+        dtype='int64',
+        shape=[slide_steps, num_thresholds + 1])
+    batch_stat_neg = helper.create_global_variable(
+        persistable=True,
+        dtype='int64',
+        shape=[slide_steps, num_thresholds + 1])
+
+    # for global auc
    stat_pos = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
    stat_neg = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])

-    for var in [stat_pos, stat_neg]:
+    for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
        helper.set_variable_initializer(
            var, Constant(
                value=0.0, force_cpu=True))

+    # Batch AUC
+    helper.append_op(
+        type="auc",
+        inputs={
+            "Predict": [input],
+            "Label": [label],
+            "StatPos": [batch_stat_pos],
+            "StatNeg": [batch_stat_neg]
+        },
+        attrs={
+            "curve": curve,
+            "num_thresholds": num_thresholds,
+            "slide_steps": slide_steps
+        },
+        outputs={
+            "AUC": [batch_auc_out],
+            "StatPosOut": [batch_stat_pos],
+            "StatNegOut": [batch_stat_neg]
+        })
+    # Global AUC
    helper.append_op(
        type="auc",
        inputs={
@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
            "StatPos": [stat_pos],
            "StatNeg": [stat_neg]
        },
-        attrs={"curve": curve,
-               "num_thresholds": num_thresholds},
+        attrs={
+            "curve": curve,
+            "num_thresholds": num_thresholds,
+            "slide_steps": 0
+        },
        outputs={
            "AUC": [auc_out],
-            "BatchAUC": [batch_auc_out],
            "StatPosOut": [stat_pos],
            "StatNegOut": [stat_neg]
        })
-    return auc_out, batch_auc_out, [stat_pos, stat_neg]
+    return auc_out, batch_auc_out, [
+        batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
+    ]
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -29,110 +29,29 @@ from .. import unique_name
 from functools import reduce

 __all__ = [
-    'fc',
-    'embedding',
-    'dynamic_lstm',
-    'dynamic_lstmp',
-    'dynamic_gru',
-    'gru_unit',
-    'linear_chain_crf',
-    'crf_decoding',
-    'cos_sim',
-    'cross_entropy',
-    'square_error_cost',
-    'chunk_eval',
-    'sequence_conv',
-    'conv2d',
-    'conv3d',
-    'sequence_pool',
-    'sequence_softmax',
-    'softmax',
-    'pool2d',
-    'pool3d',
-    'batch_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'conv3d_transpose',
-    'sequence_expand',
-    'sequence_expand_as',
-    'sequence_pad',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'sequence_first_step',
-    'sequence_last_step',
-    'dropout',
-    'split',
-    'ctc_greedy_decoder',
-    'edit_distance',
-    'l2_normalize',
-    'matmul',
-    'topk',
-    'warpctc',
-    'sequence_reshape',
-    'transpose',
-    'im2sequence',
-    'nce',
-    'hsigmoid',
-    'beam_search',
-    'row_conv',
-    'multiplex',
-    'layer_norm',
-    'softmax_with_cross_entropy',
-    'smooth_l1',
-    'one_hot',
-    'autoincreased_step_counter',
-    'reshape',
-    'squeeze',
-    'unsqueeze',
-    'lod_reset',
-    'lrn',
-    'pad',
-    'pad_constant_like',
-    'label_smooth',
-    'roi_pool',
-    'dice_loss',
-    'image_resize',
-    'image_resize_short',
-    'resize_bilinear',
-    'gather',
-    'scatter',
-    'sequence_scatter',
-    'random_crop',
-    'mean_iou',
-    'relu',
-    'log',
-    'crop',
-    'rank_loss',
-    'elu',
-    'relu6',
-    'pow',
-    'stanh',
-    'hard_sigmoid',
-    'swish',
-    'prelu',
-    'brelu',
-    'leaky_relu',
-    'soft_relu',
-    'flatten',
-    'sequence_mask',
-    'stack',
-    'pad2d',
-    'unstack',
-    'sequence_enumerate',
-    'expand',
-    'sequence_concat',
-    'scale',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
+    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
+    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
+    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d',
+    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d',
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose',
+    'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit',
+    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
+    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
+    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
+    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
+    'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm',
+    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
+    'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze',
+    'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool',
+    'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear',
+    'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu',
+    'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid',
+    'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten',
+    'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate',
+    'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div',
+    'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min',
+    'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random',
+    'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape'
 ]


@@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None):
    return out


+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+
+
+@templatedoc()
+def uniform_random_batch_size_like(input,
+                                   shape,
+                                   dtype='float32',
+                                   input_dim_idx=0,
+                                   output_dim_idx=0,
+                                   min=-1.0,
+                                   max=1.0,
+                                   seed=0):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+        shape (tuple|list): ${shape_comment}
+        input_dim_idx (Int): ${input_dim_idx_comment}
+        output_dim_idx (Int): ${output_dim_idx_comment}
+        min (Float): ${min_comment}
+        max (Float): ${max_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('uniform_random_batch_size_like', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='uniform_random_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx,
+            'min': min,
+            'max': max,
+            'seed': seed,
+            'dtype': c_dtype
+        })
+
+    return out
+
+
+@templatedoc()
+def gaussian_random(shape,
+                    mean=0.0,
+                    std=1.0,
+                    seed=0,
+                    dtype='float32',
+                    use_mkldnn=False):
+    """
+    ${comment}
+
+    Args:
+        shape (tuple|list): ${shape_comment}
+        mean (Float): ${mean_comment}
+        std (Float): ${std_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
+        use_mkldnn (Bool): Only used in mkldnn kernel.
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('gaussian_random', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='gaussian_random',
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'mean': mean,
+            'std': std,
+            'seed': seed,
+            'dtype': c_dtype,
+            'use_mkldnn': use_mkldnn
+        })
+
+    return out
+
+
+@templatedoc()
+def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        min (Float): ${min_comment}
+        max (Float): ${max_comment}
+        seed (Float): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('sampling_id', **locals())
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sampling_id',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'min': min,
+               'max': max,
+               'seed': seed})
+
+    return out
+
+
+@templatedoc()
+def gaussian_random_batch_size_like(input,
+                                    shape,
+                                    input_dim_idx=0,
+                                    output_dim_idx=0,
+                                    mean=0.0,
+                                    std=1.0,
+                                    seed=0,
+                                    dtype='float32'):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+        shape (tuple|list): ${shape_comment}
+        input_dim_idx (Int): ${input_dim_idx_comment}
+        output_dim_idx (Int): ${output_dim_idx_comment}
+        mean (Float): ${mean_comment}
+        std (Float): ${std_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
+
+    Returns:
+        out (Variable): ${out_comment}
+    """
+
+    helper = LayerHelper('gaussian_random_batch_size_like', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='gaussian_random_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx,
+            'mean': mean,
+            'std': std,
+            'seed': seed,
+            'dtype': c_dtype
+        })
+
+    return out
+
+
+@templatedoc()
+def sum(x, use_mkldnn=False):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        use_mkldnn (Bool): ${use_mkldnn_comment}
+
+    Returns:
+        out (Variable): ${out_comment}
+    """
+
+    helper = LayerHelper('sum', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype('x'))
+    helper.append_op(
+        type='sum',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'use_mkldnn': use_mkldnn})
+
+    return out
+
+
+@templatedoc()
+def slice(input, axes, starts, ends):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}.
+        axes (List): ${axes_comment}
+        starts (List): ${starts_comment}
+        ends (List): ${ends_comment}
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('slice', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
+    helper.append_op(
+        type='slice',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={'axes': axes,
+               'starts': starts,
+               'ends': ends})
+
+    return out
+
+
+@templatedoc()
+def shape(input):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('shape', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
+    helper.append_op(
+        type='shape', inputs={'Input': input}, outputs={'Out': out})
+
+    return out
+
+
 def _elementwise_op(helper):
    op_type = helper.layer_type
    x = helper.kwargs.get('x', None)

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -45,13 +45,6 @@ __all__ = [
    'logical_or',
    'logical_xor',
    'logical_not',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'sampling_id',
-    'gaussian_random_batch_size_like',
-    'sum',
-    'slice',
-    'shape',
    'maxout',
 ]

@@ -63,6 +56,8 @@ for _OP in set(__all__):
 # e.g.: test_program_code.py, test_dist_train.py
 globals()['_scale'] = generate_layer_fn('scale')

+globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
+
 __all__ += __activations_noattr__

 for _OP in set(__activations_noattr__):

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper
 from .regularizer import append_regularization_ops
 from .clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager
+from .layers import ops

 __all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer):
            x=tmp, dtype='float32' if self._dtype == None else self._dtype)
        sum = layers.cast(
            x=sum, dtype='float32' if self._dtype == None else self._dtype)
-        layers.elementwise_div(x=sum, y=tmp, out=param)
+        ops._elementwise_div(x=sum, y=tmp, out=param)

    def _add_average_restore_op(self, block, param_grad):
        param = block._clone_variable(param_grad[0])

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -67,6 +67,7 @@ def train(nn_type,
          use_cuda,
          parallel,
          save_dirname=None,
+          save_full_dirname=None,
          model_filename=None,
          params_filename=None,
          is_local=True):
@@ -143,6 +144,13 @@ def train(nn_type,
                                exe,
                                model_filename=model_filename,
                                params_filename=params_filename)
+                        if save_full_dirname is not None:
+                            fluid.io.save_inference_model(
+                                save_full_dirname, [], [],
+                                exe,
+                                model_filename=model_filename,
+                                params_filename=params_filename,
+                                export_for_deployment=False)
                        return
                    else:
                        print(
@@ -214,10 +222,12 @@ def infer(use_cuda,

 def main(use_cuda, parallel, nn_type, combine):
    save_dirname = None
+    save_full_dirname = None
    model_filename = None
    params_filename = None
    if not use_cuda and not parallel:
        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
+        save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
        if combine == True:
            model_filename = "__model_combined__"
            params_filename = "__params_combined__"
@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine):
        use_cuda=use_cuda,
        parallel=parallel,
        save_dirname=save_dirname,
+        save_full_dirname=save_full_dirname,
        model_filename=model_filename,
        params_filename=params_filename)
    infer(

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl

 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
 list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
-
 if(APPLE)
    if(NOT WITH_DISTRIBUTE)
        list(REMOVE_ITEM TEST_OPS test_desc_clone)

--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+
+import dist_ctr_reader
+from test_dist_base import TestDistRunnerBase, runtime_main
+
+IS_SPARSE = True
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistCTR2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
+        """ network definition """
+        dnn_data = fluid.layers.data(
+            name="dnn_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        lr_data = fluid.layers.data(
+            name="lr_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        label = fluid.layers.data(
+            name="click",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=0,
+            append_batch_size=False)
+
+        # build dnn model
+        dnn_layer_dims = [128, 64, 32, 1]
+        dnn_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=dnn_data,
+            size=[dnn_input_dim, dnn_layer_dims[0]],
+            param_attr=fluid.ParamAttr(
+                name="deep_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=IS_SPARSE)
+        dnn_pool = fluid.layers.sequence_pool(
+            input=dnn_embedding, pool_type="sum")
+        dnn_out = dnn_pool
+        for i, dim in enumerate(dnn_layer_dims[1:]):
+            fc = fluid.layers.fc(
+                input=dnn_out,
+                size=dim,
+                act="relu",
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.Constant(value=0.01)),
+                name='dnn-fc-%d' % i)
+            dnn_out = fc
+
+        # build lr model
+        lr_embbding = fluid.layers.embedding(
+            is_distributed=False,
+            input=lr_data,
+            size=[lr_input_dim, 1],
+            param_attr=fluid.ParamAttr(
+                name="wide_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=IS_SPARSE)
+        lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
+
+        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
+
+        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
+        acc = fluid.layers.accuracy(input=predict, label=label)
+        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
+                                                              label=label)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        inference_program = paddle.fluid.default_main_program().clone()
+
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+        sgd_optimizer.minimize(avg_cost)
+
+        dataset = dist_ctr_reader.Dataset()
+        train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
+        test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
+
+        return inference_program, avg_cost, train_reader, test_reader, None, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistCTR2x2)
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import paddle
+import tarfile
+
+logging.basicConfig()
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
+DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
+"""
+avazu_ctr_data/train.txt
+avazu_ctr_data/infer.txt
+avazu_ctr_data/test.txt
+avazu_ctr_data/data.meta.txt
+"""
+
+
+def read_data(file_name):
+    path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5)
+    tar = tarfile.open(path, "r:gz")
+    tar_info = None
+    for member in tar.getmembers():
+        if member.name.endswith(file_name):
+            tar_info = member
+    f = tar.extractfile(tar_info)
+    ret_lines = [_.decode('utf-8') for _ in f.readlines()]
+    return ret_lines
+
+
+class TaskMode:
+    TRAIN_MODE = 0
+    TEST_MODE = 1
+    INFER_MODE = 2
+
+    def __init__(self, mode):
+        self.mode = mode
+
+    def is_train(self):
+        return self.mode == self.TRAIN_MODE
+
+    def is_test(self):
+        return self.mode == self.TEST_MODE
+
+    def is_infer(self):
+        return self.mode == self.INFER_MODE
+
+    @staticmethod
+    def create_train():
+        return TaskMode(TaskMode.TRAIN_MODE)
+
+    @staticmethod
+    def create_test():
+        return TaskMode(TaskMode.TEST_MODE)
+
+    @staticmethod
+    def create_infer():
+        return TaskMode(TaskMode.INFER_MODE)
+
+
+class ModelType:
+    CLASSIFICATION = 0
+    REGRESSION = 1
+
+    def __init__(self, mode):
+        self.mode = mode
+
+    def is_classification(self):
+        return self.mode == self.CLASSIFICATION
+
+    def is_regression(self):
+        return self.mode == self.REGRESSION
+
+    @staticmethod
+    def create_classification():
+        return ModelType(ModelType.CLASSIFICATION)
+
+    @staticmethod
+    def create_regression():
+        return ModelType(ModelType.REGRESSION)
+
+
+def load_dnn_input_record(sent):
+    return list(map(int, sent.split()))
+
+
+def load_lr_input_record(sent):
+    res = []
+    for _ in [x.split(':') for x in sent.split()]:
+        res.append(int(_[0]))
+    return res
+
+
+feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2}
+
+
+class Dataset(object):
+    def train(self):
+        '''
+        Load trainset.
+        '''
+        file_name = "train.txt"
+        logger.info("load trainset from %s" % file_name)
+        mode = TaskMode.create_train()
+        return self._parse_creator(file_name, mode)
+
+    def test(self):
+        '''
+        Load testset.
+        '''
+        file_name = "test.txt"
+        logger.info("load testset from %s" % file_name)
+        mode = TaskMode.create_test()
+        return self._parse_creator(file_name, mode)
+
+    def infer(self):
+        '''
+        Load infer set.
+        '''
+        file_name = "infer.txt"
+        logger.info("load inferset from %s" % file_name)
+        mode = TaskMode.create_infer()
+        return self._parse_creator(file_name, mode)
+
+    def _parse_creator(self, file_name, mode):
+        '''
+        Parse dataset.
+        '''
+
+        def _parse():
+            data = read_data(file_name)
+            for line_id, line in enumerate(data):
+                fs = line.strip().split('\t')
+                dnn_input = load_dnn_input_record(fs[0])
+                lr_input = load_lr_input_record(fs[1])
+                if not mode.is_infer():
+                    click = int(fs[2])
+                    yield [dnn_input, lr_input, click]
+                else:
+                    yield [dnn_input, lr_input]
+
+        return _parse
+
+
+def load_data_meta():
+    '''
+    load data meta info from path, return (dnn_input_dim, lr_input_dim)
+    '''
+    lines = read_data('data.meta.txt')
+    err_info = "wrong meta format"
+    assert len(lines) == 2, err_info
+    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
+        1], err_info
+    res = map(int, [_.split(':')[1] for _ in lines])
+    res = list(res)
+    logger.info('dnn input dim: %d' % res[0])
+    logger.info('lr input dim: %d' % res[1])
+    return res
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -47,7 +47,7 @@ def cnn_model(data):
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.3)))
+            value=0.01)))
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
@@ -56,7 +56,7 @@ def cnn_model(data):
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.2)))
+            value=0.01)))

    SIZE = 10
    input_shape = conv_pool_2.shape
@@ -68,7 +68,7 @@ def cnn_model(data):
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.1)))
+            initializer=fluid.initializer.Constant(value=0.01)))
    return predict



--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+import random
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+
+DTYPE = "int64"
+DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
+DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
+
+# For Net
+base_lr = 0.2
+emb_lr = base_lr * 3
+dict_dim = 1500
+emb_dim = 128
+hid_dim = 128
+margin = 0.1
+sample_rate = 1
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+def get_acc(cos_q_nt, cos_q_pt, batch_size):
+    cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
+    cond = fluid.layers.cast(cond, dtype='float64')
+    cond_3 = fluid.layers.reduce_sum(cond)
+    acc = fluid.layers.elementwise_div(
+        cond_3,
+        fluid.layers.fill_constant(
+            shape=[1], value=batch_size * 1.0, dtype='float64'),
+        name="simnet_acc")
+    return acc
+
+
+def get_loss(cos_q_pt, cos_q_nt):
+    loss_op1 = fluid.layers.elementwise_sub(
+        fluid.layers.fill_constant_batch_size_like(
+            input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
+        cos_q_pt)
+    loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
+    loss_op3 = fluid.layers.elementwise_max(
+        fluid.layers.fill_constant_batch_size_like(
+            input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+        loss_op2)
+    avg_cost = fluid.layers.mean(loss_op3)
+    return avg_cost
+
+
+def get_optimizer():
+    # SGD optimizer
+    optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
+    return optimizer
+
+
+def train_network(batch_size, is_distributed=False, is_sparse=False):
+    # query
+    q = fluid.layers.data(
+        name="query_ids", shape=[1], dtype="int64", lod_level=1)
+    ## embedding
+    q_emb = fluid.layers.embedding(
+        input=q,
+        is_distributed=is_distributed,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
+    q_ss = fluid.layers.softsign(q_sum)
+    ## fc layer after conv
+    q_fc = fluid.layers.fc(
+        input=q_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__q_fc__",
+            learning_rate=base_lr))
+    # label data
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    # pt
+    pt = fluid.layers.data(
+        name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+    ## embedding
+    pt_emb = fluid.layers.embedding(
+        input=pt,
+        is_distributed=is_distributed,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
+    pt_ss = fluid.layers.softsign(pt_sum)
+    ## fc layer
+    pt_fc = fluid.layers.fc(
+        input=pt_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__fc__",
+            learning_rate=base_lr),
+        bias_attr=fluid.ParamAttr(name="__fc_b__"))
+    # nt
+    nt = fluid.layers.data(
+        name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+    ## embedding
+    nt_emb = fluid.layers.embedding(
+        input=nt,
+        is_distributed=is_distributed,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
+    nt_ss = fluid.layers.softsign(nt_sum)
+    ## fc layer
+    nt_fc = fluid.layers.fc(
+        input=nt_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__fc__",
+            learning_rate=base_lr),
+        bias_attr=fluid.ParamAttr(name="__fc_b__"))
+    cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
+    cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
+    # loss
+    avg_cost = get_loss(cos_q_pt, cos_q_nt)
+    # acc
+    acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
+    return [avg_cost, acc, cos_q_pt]
+
+
+def combination(x, y):
+    res = [[[xi, yi] for yi in y] for xi in x]
+    return res[0]
+
+
+def get_one_data(file_list):
+    for file in file_list:
+        contents = []
+        with open(file, "r") as fin:
+            for i in fin:
+                contents.append(i.strip())
+            for index, q in enumerate(contents):
+                try:
+                    one_data = [[int(j) for j in i.split(" ")]
+                                for i in q.split(";")[:-1]]
+                    if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
+                        q = fin.readline()
+                        continue
+                    tmp = combination(one_data[3:3 + one_data[1][0]],
+                                      one_data[3 + one_data[1][0]:])
+                except Exception as e:
+                    continue
+
+                for each in tmp:
+                    yield [one_data[2], 0, each[0], each[1]]
+
+
+def get_batch_reader(file_list, batch_size):
+    def batch_reader():
+        res = []
+        for i in get_one_data(file_list):
+            if random.random() <= sample_rate:
+                res.append(i)
+            if len(res) >= batch_size:
+                yield res
+                res = []
+
+    return batch_reader
+
+
+def get_train_reader(batch_size):
+    # The training data set.
+    train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
+                              "train")
+    train_reader = get_batch_reader([train_file], batch_size)
+    train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
+    return train_reader, train_feed
+
+
+class TestDistSimnetBow2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Train program
+        avg_cost, acc, predict = \
+            train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
+
+        inference_program = fluid.default_main_program().clone()
+
+        # Optimization
+        opt = get_optimizer()
+        opt.minimize(avg_cost)
+
+        # Reader
+        train_reader, _ = get_train_reader(batch_size)
+        return inference_program, avg_cost, train_reader, train_reader, acc, predict
+
+
+if __name__ == "__main__":
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSimnetBow2x2)
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+import six
+import tarfile
+import string
+import re
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+
+DTYPE = "float32"
+VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
+VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
+DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
+DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'
+
+
+# Load dictionary.
+def load_vocab(filename):
+    vocab = {}
+    if six.PY2:
+        with open(filename, 'r') as f:
+            for idx, line in enumerate(f):
+                vocab[line.strip()] = idx
+    else:
+        with open(filename, 'r', encoding="utf-8") as f:
+            for idx, line in enumerate(f):
+                vocab[line.strip()] = idx
+    return vocab
+
+
+def get_worddict(dict_path):
+    word_dict = load_vocab(dict_path)
+    word_dict["<unk>"] = len(word_dict)
+    dict_dim = len(word_dict)
+    return word_dict, dict_dim
+
+
+def conv_net(input,
+             dict_dim,
+             emb_dim=128,
+             window_size=3,
+             num_filters=128,
+             fc0_dim=96,
+             class_dim=2):
+    emb = fluid.layers.embedding(
+        input=input,
+        size=[dict_dim, emb_dim],
+        is_sparse=False,
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
+
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=num_filters,
+        filter_size=window_size,
+        act="tanh",
+        pool_type="max",
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+
+    fc_0 = fluid.layers.fc(
+        input=[conv_3],
+        size=fc0_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+
+    prediction = fluid.layers.fc(
+        input=[fc_0],
+        size=class_dim,
+        act="softmax",
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01)))
+
+    return prediction
+
+
+def inference_network(dict_dim):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    out = conv_net(data, dict_dim)
+    return out
+
+
+def get_reader(word_dict, batch_size):
+    # The training data set.
+    train_reader = paddle.batch(train(word_dict), batch_size=batch_size)
+
+    # The testing data set.
+    test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
+
+    return train_reader, test_reader
+
+
+def get_optimizer(learning_rate):
+    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
+    return optimizer
+
+
+class TestDistTextClassification2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        vocab = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "text_classification", "imdb.vocab")
+        word_dict, dict_dim = get_worddict(vocab)
+
+        # Input data
+        data = fluid.layers.data(
+            name="words", shape=[1], dtype="int64", lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = conv_net(data, dict_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=predict, label=label)
+        inference_program = fluid.default_main_program().clone()
+
+        # Optimization
+        opt = get_optimizer(learning_rate=0.001)
+        opt.minimize(avg_cost)
+
+        # Reader
+        train_reader, test_reader = get_reader(word_dict, batch_size)
+
+        return inference_program, avg_cost, train_reader, test_reader, acc, predict
+
+
+def tokenize(pattern):
+    """
+    Read files that match the given pattern.  Tokenize and yield each file.
+    """
+
+    with tarfile.open(
+            paddle.dataset.common.download(DATA_URL, 'text_classification',
+                                           DATA_MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip(six.b(
+                    "\n\r")).translate(
+                        None, six.b(string.punctuation)).lower().split()
+            tf = tarf.next()
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+    UNK = word_idx['<unk>']
+    INS = []
+
+    def load(pattern, out, label):
+        for doc in tokenize(pattern):
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+
+    def reader():
+        for doc, label in INS:
+            yield doc, label
+
+    return reader
+
+
+def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("train/pos/.*\.txt$"),
+        re.compile("train/neg/.*\.txt$"), word_idx)
+
+
+def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("test/pos/.*\.txt$"),
+        re.compile("test/neg/.*\.txt$"), word_idx)
+
+
+if __name__ == "__main__":
+    paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
+    paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
+    runtime_main(TestDistTextClassification2x2)
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size,
    if weight_sharing:
        predict = layers.matmul(
            x=dec_output,
-            y=fluid.get_var(word_emb_param_names[0]),
+            y=fluid.framework._get_var(word_emb_param_names[0]),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, use_cuda, args):
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        TrainTaskConfig.use_gpu = use_cuda
-        sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
+    def run_trainer(self, args):
+        TrainTaskConfig.use_gpu = args.use_cuda
+        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
            args.is_dist, not args.sync_mode)

        if args.is_dist:
@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase):
            TrainTaskConfig.batch_size = 20
            trainer_prog = fluid.default_main_program()

+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
        startup_exe = fluid.Executor(place)

        TrainTaskConfig.local = not args.is_dist

--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase):


 if __name__ == "__main__":
+    import os
+    os.environ['CPU_NUM'] = '1'
+    os.environ['USE_CUDA'] = "FALSE"
    runtime_main(TestDistWord2vec2x2)
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                    "Output (" + out_name + ") has diff at " + str(place) +
                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                    str(actual_t))
+                    str(actual_t) + " in class " + self.__class__.__name__)
                if isinstance(expect, tuple):
                    self.assertListEqual(actual.recursive_sequence_lengths(),
                                         expect[1], "Output (" + out_name +

--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -36,7 +36,11 @@ class TestAucOp(OpTest):
            "StatPos": stat_pos,
            "StatNeg": stat_neg
        }
-        self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
+        self.attrs = {
+            'curve': 'ROC',
+            'num_thresholds': num_thresholds,
+            "slide_steps": 1
+        }

        python_auc = metrics.Auc(name="auc",
                                 curve='ROC',
@@ -45,7 +49,6 @@ class TestAucOp(OpTest):

        self.outputs = {
            'AUC': np.array(python_auc.eval()),
-            'BatchAUC': np.array(python_auc.eval()),
            'StatPosOut': np.array(python_auc._stat_pos),
            'StatNegOut': np.array(python_auc._stat_neg)
        }

--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -20,6 +20,7 @@ import six
 import sys
 import collections
 import math
+import paddle.fluid as fluid
 from op_test import OpTest


@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest):
        self.detect = np.array(self.detect).astype('float32')
        self.mAP = np.array(self.mAP).astype('float32')

-        if (len(self.class_pos_count) > 0):
+        if len(self.class_pos_count) > 0:
            self.class_pos_count = np.array(self.class_pos_count).astype(
                'int32')
            self.true_pos = np.array(self.true_pos).astype('float32')
@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
 class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
    def init_test_case(self):
        super(TestDetectionMAPOpMultiBatch, self).init_test_case()
-        self.class_pos_count = [0, 2, 1]
+        self.class_pos_count = [0, 2, 1, 0]
        self.true_pos_lod = [[0, 3, 2]]
        self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
        self.false_pos_lod = [[0, 3, 2]]

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -18,23 +18,27 @@ import time
 import unittest
 import os
 import sys
-import six
 import signal
 import subprocess
+import six
 import argparse

+import paddle.fluid as fluid
+
+RUN_STEP = 10
+

 class TestDistRunnerBase(object):
    def get_model(self, batch_size=2):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

-    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
-                       trainers, sync_mode):
+    @staticmethod
+    def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers,
+                       sync_mode):
        # NOTE: import fluid until runtime, or else forking processes will cause error.
-        import paddle
-        import paddle.fluid as fluid
-        t = fluid.DistributeTranspiler()
+        config = fluid.DistributeTranspilerConfig()
+        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
@@ -44,11 +48,9 @@ class TestDistRunnerBase(object):
        return t

    def run_pserver(self, args):
-        import paddle
-        import paddle.fluid as fluid
+
        self.get_model(batch_size=2)
-        if args.mem_opt:
-            fluid.memory_optimize(fluid.default_main_program())
+        # NOTE: pserver should not call memory optimize
        t = self.get_transpiler(args.trainer_id,
                                fluid.default_main_program(), args.endpoints,
                                args.trainers, args.sync_mode)
@@ -61,29 +63,34 @@ class TestDistRunnerBase(object):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, use_cuda, args):
-        import paddle
-        import paddle.fluid as fluid
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    def run_trainer(self, args):
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
+
        if args.mem_opt:
-            fluid.memory_optimize(fluid.default_main_program())
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
        if args.is_dist:
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
                                    args.endpoints, args.trainers,
                                    args.sync_mode)
+
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
+
        build_stra = fluid.BuildStrategy()

        if args.use_reduce:
@@ -92,7 +99,7 @@ class TestDistRunnerBase(object):
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(
-            use_cuda,
+            args.use_cuda,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)
@@ -103,27 +110,26 @@ class TestDistRunnerBase(object):
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = test_reader()
-
-        data = next(reader_generator)
-        first_loss, = exe.run(fetch_list=[avg_cost.name],
-                              feed=feeder.feed(data))
-        print(first_loss)
+        reader_generator = train_reader()

-        for i in six.moves.xrange(5):
-            data = next(reader_generator)
-            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.is_dist and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch

-        data = next(reader_generator)
-        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
-        print(last_loss)
+        for _ in six.moves.xrange(RUN_STEP):
+            loss, = exe.run(fetch_list=[avg_cost.name],
+                            feed=feeder.feed(get_data()))
+            print(loss)


 def runtime_main(test_class):
-    import paddle
-    import paddle.fluid as fluid
-    import paddle.fluid.core as core
-
    parser = argparse.ArgumentParser(description='Run dist test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
@@ -135,7 +141,10 @@ def runtime_main(test_class):
        '--current_endpoint', type=str, required=False, default="")
    parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--mem_opt', action='store_true')
+    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')
+    parser.add_argument(
+        '--use_reader_alloc', action='store_true', required=False, default=True)

    args = parser.parse_args()

@@ -143,8 +152,7 @@ def runtime_main(test_class):
    if args.role == "pserver" and args.is_dist:
        model.run_pserver(args)
    else:
-        use_cuda = True if core.is_compiled_with_cuda() else False
-        model.run_trainer(use_cuda, args)
+        model.run_trainer(args)


 import paddle.compat as cpt
@@ -163,8 +171,10 @@ class TestDistBase(unittest.TestCase):
            self._find_free_port(), self._find_free_port())
        self._python_interp = "python"
        self._sync_mode = True
+        self._use_cuda = True
        self._mem_opt = False
        self._use_reduce = False
+        self._use_reader_alloc = True
        self._setup_config()

    def _find_free_port(self):
@@ -172,15 +182,15 @@ class TestDistBase(unittest.TestCase):
            s.bind(('', 0))
            return s.getsockname()[1]

-    def start_pserver(self, model_file, check_error_log):
+    def start_pserver(self, model_file, check_error_log, required_envs):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
        ps0_cmd = ps_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
+                  (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+                   self._trainers)
        ps1_cmd = ps_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+                  (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+                   self._trainers)

        if self._sync_mode:
            ps0_cmd += " --sync_mode"
@@ -198,9 +208,15 @@ class TestDistBase(unittest.TestCase):
            ps1_pipe = open("/tmp/ps1_err.log", "wb")

        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
+            ps0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps0_pipe,
+            env=required_envs)
        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
+            ps1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps1_pipe,
+            env=required_envs)

        if not check_error_log:
            return ps0_proc, ps1_proc, None, None
@@ -222,59 +238,60 @@ class TestDistBase(unittest.TestCase):
                                 (e, retry_times))
                retry_times -= 1

-    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
-        # TODO(typhoonzero): should auto adapt GPU count on the machine.
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1",
-            "CPU_NUM": "1"
-        }
+    def _run_local(self, model, envs, check_error_log):

-        if check_error_log:
-            required_envs["GLOG_v"] = "7"
-            required_envs["GLOG_logtostderr"] = "1"
+        cmd = "%s %s --role trainer" % (self._python_interp, model)
+
+        if self._use_cuda:
+            cmd += " --use_cuda"
+            env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        else:
+            env_local = {'CPU_NUM': '1'}
+
+        envs.update(env_local)

-        # Run local to get a base line
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        env_local.update(required_envs)
-        local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
        if not check_error_log:
+            err_log = open("/tmp/trainer.err.log", "wb")
            local_proc = subprocess.Popen(
-                local_cmd.split(" "),
+                cmd.split(" "),
                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                env=env_local)
+                stderr=err_log,
+                env=envs)
        else:
-            err_log = open("/tmp/trainer.err.log", "wb")
            local_proc = subprocess.Popen(
-                local_cmd.split(" "),
+                cmd.split(" "),
                stdout=subprocess.PIPE,
-                stderr=err_log,
-                env=env_local)
+                stderr=subprocess.PIPE,
+                env=envs)

        local_proc.wait()
-        out, err = local_proc.communicate()
-        local_ret = cpt.to_text(out)
-        sys.stderr.write('local_loss: %s\n' % local_ret)
-        sys.stderr.write('local_stderr: %s\n' % err)
+        local_out, local_err = local_proc.communicate()
+        local_ret = cpt.to_text(local_out)
+
+        if check_error_log:
+            err_log.close()
+
+        sys.stderr.write('local_stdout: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % local_err)

+        local_losses = local_ret.split("\n")
+        return local_losses
+
+    def _run_cluster(self, model, envs, check_error_log):
        # Run dist train to compare with local results
-        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
-                                                          check_error_log)
+        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
+                                                          check_error_log, envs)
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)
-
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+
        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
        tr0_cmd = tr_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints,
-             0, ps0_ep, self._trainers)
+                  (self._python_interp, model, self._ps_endpoints,
+                   0, ps0_ep, self._trainers)
        tr1_cmd = tr_cmd % \
-            (self._python_interp, model_file, self._ps_endpoints,
-             1, ps1_ep, self._trainers)
+                  (self._python_interp, model, self._ps_endpoints,
+                   1, ps1_ep, self._trainers)

        if self._sync_mode:
            tr0_cmd += " --sync_mode"
@@ -285,18 +302,28 @@ class TestDistBase(unittest.TestCase):
        if self._use_reduce:
            tr0_cmd += " --use_reduce"
            tr1_cmd += " --use_reduce"
+        if self._use_reader_alloc:
+            tr0_cmd += " --use_reader_alloc"
+            tr1_cmd += " --use_reader_alloc"
+        if self._use_cuda:
+            tr0_cmd += " --use_cuda"
+            tr1_cmd += " --use_cuda"
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        else:
+            env0 = {'CPU_NUM': '1'}
+            env1 = {'CPU_NUM': '1'}
+
+        env0.update(envs)
+        env1.update(envs)

-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        env0.update(required_envs)
-        env1.update(required_envs)
        FNULL = open(os.devnull, 'w')

        tr0_pipe = subprocess.PIPE
        tr1_pipe = subprocess.PIPE
        if check_error_log:
-            print("tr0_cmd:", tr0_cmd)
-            print("tr1_cmd:", tr1_cmd)
+            print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
+            print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
            tr0_pipe = open("/tmp/tr0_err.log", "wb")
            tr1_pipe = open("/tmp/tr1_err.log", "wb")

@@ -313,17 +340,11 @@ class TestDistBase(unittest.TestCase):

        tr0_proc.wait()
        tr1_proc.wait()
-        out, err = tr0_proc.communicate()
-        sys.stderr.write('dist_stderr: %s\n' % err)
-        loss_data0 = cpt.to_text(out)
-        sys.stderr.write('dist_loss: %s\n' % loss_data0)
-        lines = loss_data0.split("\n")
-        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
-        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
-
-        local_lines = local_ret.split("\n")
-        local_first_loss = eval(local_lines[0])[0]
-        local_last_loss = eval(local_lines[1])[0]
+
+        tr0_out, tr0_err = tr0_proc.communicate()
+        tr0_loss_text = cpt.to_text(tr0_out)
+        tr1_out, tr1_err = tr1_proc.communicate()
+        tr1_loss_text = cpt.to_text(tr1_out)

        # close trainer file
        if check_error_log:
@@ -341,5 +362,47 @@ class TestDistBase(unittest.TestCase):
        ps1.wait()
        FNULL.close()

-        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
-        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
+        # print log
+        sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
+        sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
+        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+
+        tr0_losses = tr0_loss_text.split("\n")
+        tr1_losses = tr1_loss_text.split("\n")
+
+        return tr0_losses, tr1_losses
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        # TODO(typhoonzero): should auto adapt GPU count on the machine.
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1",
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        local_losses\
+            = self._run_local(model_file, required_envs,
+                                       check_error_log)
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
+                                                   check_error_log)
+
+        for step_id in range(RUN_STEP):
+            local_loss = eval(local_losses[step_id])[0]
+            tr0_loss = eval(tr0_losses[step_id])[0]
+            tr1_loss = eval(tr1_losses[step_id])[0]
+            dist_loss = (tr0_loss + tr1_loss) / 2
+            print(str(local_loss) + ":" + str(dist_loss))
+            self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistCTR2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_cuda = False
+
+    def test_dist_ctr(self):
+        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase):
        self._use_reduce = False

    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-7)
+        self.check_with_place("dist_mnist.py", delta=1e-5)


 class TestDistMnist2x2WithMemopt(TestDistBase):
@@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
        self._mem_opt = True

    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-7)
+        self.check_with_place("dist_mnist.py", delta=1e-5)


 class TestDistMnistAsync(TestDistBase):

--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -20,24 +20,25 @@ from test_dist_base import TestDistBase
 class TestDistSeResneXt2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
+        self._use_reader_alloc = False

    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=1e-7)
+        self.check_with_place("dist_se_resnext.py", delta=100)


-# TODO(typhoonzero): fix this test
-# class TestDistseResnXt2x2WithMemopt(TestDistBase):
-#     def _setup_config(self):
-#         self._sync_mode = True
-#         self._mem_opt = True
+class TestDistseResnXt2x2WithMemopt(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._mem_opt = True

-#     def test_dist_train(self):
-#         self.check_with_place("dist_se_resnext.py", delta=1e-7)
+    def test_dist_train(self):
+        self.check_with_place("dist_se_resnext.py", delta=100)


 class TestDistSeResneXt2x2Async(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
+        self._use_reader_alloc = False

    def test_dist_train(self):
        self.check_with_place("dist_se_resnext.py", delta=100)

--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import unittest
+
+from test_dist_base import TestDistBase
+
+
+class TestDistSimnetBowDense2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_cuda = False
+
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+class TestDistSimnetBow2x2DenseAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._use_cuda = False
+
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=100,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+class TestDistSimnetBowSparse2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_cuda = False
+
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+class TestDistSimnetBow2x2SparseAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._use_cuda = False
+
+    def test_simnet_bow(self):
+        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=100,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistTextClassification2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_cuda = False
+
+    def test_text_classification(self):
+        self.check_with_place("dist_text_classification.py", delta=1e-6)
+
+
+class TestDistTextClassification2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._use_cuda = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_text_classification.py", delta=100)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest):
        ])


+class TestDecayedAdagrad(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
+        opt.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer, _ = self.get_trainer()
+
+
 class TestLRDecayConditional(TranspilerTest):
    def net_conf(self):
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')

--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase):
        self._sync_mode = False

    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1)
+        self.check_with_place("dist_word2vec.py", delta=100)


 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest):
            'eta': self.eta
        }

-        print("lod = ", self.lod)
        self.outputs = {
            'RpnRois': (self.rpn_rois[0], [self.lod]),
            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
@@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest):
        self.post_nms_topN = 5000  # train 6000, test 1000
        self.nms_thresh = 0.7
        self.min_size = 3.0
-        self.eta = 0.8
+        self.eta = 1.

    def init_test_input(self):
        batch_size = 1

--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
@@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase):
        mul_op_desc.set_input("X", ["x"])
        mul_op_desc.set_input("Y", ["y"])
        mul_op_desc.set_output("Out", ["out"])
-        mul_op_desc.set_attr("x_num_col_dims", 1)
-        mul_op_desc.set_attr("y_num_col_dims", 1)
+        mul_op_desc._set_attr("x_num_col_dims", 1)
+        mul_op_desc._set_attr("y_num_col_dims", 1)

        mul_op_desc.check_attrs()
        mul_op_desc.infer_shape(block)

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -541,7 +541,7 @@ class TestBook(unittest.TestCase):
        with program_guard(program):
            input = layers.data(
                name="input", shape=[3, 100, 100], dtype="float32")
-            out = layers.shape(input, name="shape")
+            out = layers.shape(input)
            self.assertIsNotNone(out)
        print(str(program))

@@ -758,6 +758,65 @@ class TestBook(unittest.TestCase):
            out = layers.expand(x, [1, 2])
        print(str(program))

+    def test_uniform_random_batch_size_like(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+            out = layers.uniform_random_batch_size_like(input, [-1, 11])
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_gaussian_random(self):
+        program = Program()
+        with program_guard(program):
+            out = layers.gaussian_random(shape=[20, 30])
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_sampling_id(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name="X",
+                shape=[13, 11],
+                dtype='float32',
+                append_batch_size=False)
+
+            out = layers.sampling_id(x)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_gaussian_random_batch_size_like(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+
+            out = layers.gaussian_random_batch_size_like(
+                input, shape=[-1, 11], mean=1.0, std=2.0)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_sum(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+
+            out = layers.sum(input)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_slice(self):
+        starts = [1, 0, 2]
+        ends = [3, 3, 4]
+        axes = [0, 1, 2]
+
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[3, 4, 5, 6], dtype='float32')
+
+            out = layers.slice(input, axes=axes, starts=starts, ends=ends)
+
    def test_softshrink(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import unittest
+import os
+import sys
+import math
+
+
+def simple_fc_net():
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestPassBuilder(unittest.TestCase):
+    def check_network_convergence(self, use_cuda, build_strategy=None):
+        os.environ['CPU_NUM'] = str(4)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net()
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = np.random.normal(size=(batch_size, 784)).astype('float32')
+            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                loss_name=loss.name,
+                main_program=main,
+                build_strategy=build_strategy)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                main_program=test_program,
+                share_vars_from=train_exe,
+                build_strategy=build_strategy)
+
+            for i in range(5):
+                test_loss, = test_exe.run([loss.name], feed=feed_dict)
+
+                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+
+                avg_test_loss_val = np.array(test_loss).mean()
+                if math.isnan(float(avg_test_loss_val)):
+                    sys.exit("got NaN loss, testing failed.")
+
+                avg_train_loss_val = np.array(train_loss).mean()
+                if math.isnan(float(avg_train_loss_val)):
+                    sys.exit("got NaN loss, training failed.")
+
+                self.assertTrue(
+                    np.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
+
+    def test_parallel_testing_with_new_strategy(self):
+        build_strategy = fluid.BuildStrategy()
+        pass_builder = build_strategy._create_passes_from_strategy()
+        origin_len = len(pass_builder.all_passes())
+
+        viz_pass = pass_builder.append_pass("graph_viz_pass")
+        self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
+
+        pass_builder.insert_pass(
+            len(pass_builder.all_passes()), "graph_viz_pass")
+        self.assertEqual(origin_len + 2, len(pass_builder.all_passes()))
+
+        pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
+        self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
+        viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
+
+        self.check_network_convergence(
+            use_cuda=core.is_compiled_with_cuda(),
+            build_strategy=build_strategy)
+        try:
+            os.stat("/tmp/test_viz_pass")
+        except os.error:
+            self.assertFalse(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase):
        self.assertEqual(['z'], op.output("Out"))
        self.assertEqual(["Out"], op.output_names())

-        op.set_attr("int_attr", 1)
+        op._set_attr("int_attr", 1)
        self.assertEqual(1, op.attr("int_attr"))
        self.assertTrue(op.has_attr("int_attr"))
        self.assertEqual(core.AttrType.INT, op.attr_type("int_attr"))

-        op.set_attr("float_attr", -1.32)
+        op._set_attr("float_attr", -1.32)
        self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4)
        self.assertTrue(op.has_attr("float_attr"))

-        op.set_attr("bool_attr", False)
+        op._set_attr("bool_attr", False)
        self.assertFalse(op.attr("bool_attr"))

-        op.set_attr("string_attr", "abc")
+        op._set_attr("string_attr", "abc")
        self.assertEqual("abc", op.attr("string_attr"))
        self.assertTrue(op.has_attr("string_attr"))

-        op.set_attr("ints_attr", [1, 2, 3])
+        op._set_attr("ints_attr", [1, 2, 3])
        self.assertEqual([1, 2, 3], op.attr("ints_attr"))

        expected = [1.2, 2.3, 3.4]
-        op.set_attr("floats_attr", expected)
+        op._set_attr("floats_attr", expected)
        for e, a in zip(expected, op.attr("floats_attr")):
            self.assertAlmostEqual(e, a, delta=1e-4)

-        op.set_attr("strings_attr", ["a", "b", "c"])
+        op._set_attr("strings_attr", ["a", "b", "c"])
        self.assertEqual(["a", "b", "c"], op.attr("strings_attr"))

-        op.set_attr("bools_attr", [True, False, True])
+        op._set_attr("bools_attr", [True, False, True])
        self.assertEqual([True, False, True], op.attr("bools_attr"))

        self.assertEqual(8, len(op.attr_names()))

-        op.set_block_attr("block_attr", program_desc.block(0))
-        self.assertEqual(0, op.block_attr_id("block_attr"))
+        op.set_block_attr("_block_attr", program_desc.block(0))
+        self.assertEqual(0, op._block_attr_id("_block_attr"))

        mul_op = block.append_op()
        mul_op.set_type("mul")

--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -246,6 +246,7 @@ def prepare_encoder(src_word,
        padding_idx=pos_pad_idx,
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
+    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc

    # FIXME(guosheng): Decouple the program desc with batch_size.

--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
 from .ps_dispatcher import HashName, RoundRobin

 __all__ = [
-    "DistributeTranspiler", "memory_optimize", "release_memory", "HashName",
-    "RoundRobin", "DistributeTranspilerConfig"
+    "DistributeTranspiler",
+    "memory_optimize",
+    "release_memory",
+    "HashName",
+    "RoundRobin",
+    "DistributeTranspilerConfig",
 ]
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -128,7 +128,7 @@ def op_to_code(op):
        attr_type = op.desc.attr_type(name)
        if attr_type == core.AttrType.BLOCK:
            a = "{name} = block[{value}]".format(
-                name=name, type=attr_type, value=op.block_attr_id(name))
+                name=name, type=attr_type, value=op._block_attr_id(name))
            attrs_str += a
            if i != len(attr_names) - 1:
                attrs_str += ", "
@@ -136,7 +136,7 @@ def op_to_code(op):

        if attr_type == core.AttrType.BLOCKS:
            a = "{name} = blocks{value}".format(
-                name=name, type=attr_type, value=op.blocks_attr_ids(name))
+                name=name, type=attr_type, value=op._blocks_attr_ids(name))
            attrs_str += a
            if i != len(attr_names) - 1:
                attrs_str += ", "

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -39,8 +39,8 @@ import six
 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
-                        default_startup_program, Block, \
-                        Parameter, grad_var_name
+    default_startup_program, Block, \
+    Parameter, grad_var_name
 from .details import *
 from functools import reduce

@@ -178,7 +178,7 @@ class DistributeTranspiler(object):
                                                                pserver_program)
           elif role == "TRAINER":
                trainer_program = t.get_trainer_program()
-           
+
           # for nccl2 mode
           config = fluid.DistributeTranspilerConfig()
           config.mode = "nccl2"
@@ -470,7 +470,10 @@ class DistributeTranspiler(object):
        """
        # remove optimize ops and add a send op to main_program
        # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
+        lr_ops = self._get_lr_ops()
        delete_ops(self.origin_program.global_block(), self.optimize_ops)
+        delete_ops(self.origin_program.global_block(), lr_ops)
+
        self.origin_program.__str__()

        if wait_port:
@@ -534,7 +537,7 @@ class DistributeTranspiler(object):
            })

        for varname, splited_var in six.iteritems(self.param_var_mapping):
-            #add concat ops to merge splited parameters received from parameter servers.
+            # add concat ops to merge splited parameters received from parameter servers.
            if len(splited_var) <= 1:
                continue
            # NOTE: if enable memory optimization, origin vars maybe removed.
@@ -668,7 +671,7 @@ in a single call.")
                __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)

            # reset the block of op
-            op.set_attr('sub_block', new_sub_block)
+            op._set_attr('sub_block', new_sub_block)

        # append lr decay ops to the child block if exists
        lr_ops = self._get_lr_ops()
@@ -734,19 +737,14 @@ in a single call.")
            table_opt_block = self._create_table_optimize_block(
                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
            optimize_blocks.append(table_opt_block)
-            prefetch_var_name_to_block_id = self._create_prefetch_block(
+            lookup_table_var_name_to_block_id = self._create_prefetch_block(
                pserver_index, pserver_program, table_opt_block)
            checkpoint_block_id = self._create_checkpoint_save_block(
                pserver_program, table_opt_block.idx)

            pserver_program._distributed_lookup_table = self.table_name
-
-        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
-        # not be executed, so it's safe to use optimize_block to hold the place
-        if self.has_distributed_lookup_table:
-            assert len(prefetch_var_name_to_block_id) > 0
-        else:
-            assert len(prefetch_var_name_to_block_id) == 0
+            prefetch_var_name_to_block_id.extend(
+                lookup_table_var_name_to_block_id)

        attrs = {
            "optimize_blocks": optimize_blocks,
@@ -755,11 +753,14 @@ in a single call.")
            "sync_mode": self.sync_mode,
            "grad_to_block_id": grad_to_block_id,
        }
-        if len(prefetch_var_name_to_block_id) > 0:
-            attrs['prefetch_var_name_to_block_id'] \
-                = prefetch_var_name_to_block_id
+
+        if self.has_distributed_lookup_table:
            attrs['checkpint_block_id'] = checkpoint_block_id

+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs[
+                'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
+
        # step5 append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",
@@ -864,7 +865,7 @@ to transpile() call.")
                if op.type in [
                        "gaussian_random", "fill_constant", "uniform_random"
                ]:
-                    op.set_attr("shape", list(new_outputs["Out"].shape))
+                    op._set_attr("shape", list(new_outputs["Out"].shape))
                s_prog.global_block().append_op(
                    type=op.type,
                    inputs=new_inputs,
@@ -1013,7 +1014,7 @@ to transpile() call.")
        for g, p in zip(grad_blocks, param_blocks):
            g_name, g_bid, _ = g.split(":")
            p_name, p_bid, _ = p.split(":")
-            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
                self.param_var_mapping[p_name][int(p_bid)]

        # create mapping of endpoint -> split var to create pserver side program
@@ -1320,7 +1321,7 @@ to transpile() call.")
            if len(splited) == 1:
                if self.sync_mode and add_trainer_suffix:
                    new_var_name = "%s.trainer_%d" % \
-                        (orig_var.name, self.trainer_id)
+                                   (orig_var.name, self.trainer_id)
                    program.global_block()._rename_var(varname, new_var_name)
                    var_mapping[varname] = \
                        [program.global_block().var(new_var_name)]
@@ -1343,10 +1344,10 @@ to transpile() call.")
                new_var_name = ""
                if self.sync_mode and add_trainer_suffix:
                    new_var_name = "%s.block%d.trainer_%d" % \
-                        (varname, i, self.trainer_id)
+                                   (varname, i, self.trainer_id)
                else:
                    new_var_name = "%s.block%d" % \
-                        (varname, i)
+                                   (varname, i)
                var = program.global_block().create_var(
                    name=new_var_name,
                    persistable=False,
@@ -1430,6 +1431,9 @@ to transpile() call.")
        elif op_type == "rmsprop":
            if varkey in ["Moment", "MeanSquare"]:
                return param_shape
+        elif op_type == "decayed_adagrad":
+            if varkey == "Moment":
+                return param_shape
        elif op_type == "sgd":
            pass
        return orig_shape
@@ -1484,9 +1488,8 @@ to transpile() call.")
            vars2merge = []
            for i in range(self.trainer_num):
                per_trainer_name = "%s.trainer_%d" % \
-                (merged_var_name, i)
+                                   (merged_var_name, i)
                vars2merge.append(pserver_block.vars[per_trainer_name])
-
            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
@@ -1645,7 +1648,7 @@ to transpile() call.")
        # one op's output is another op's input, we say
        # the two operator is connected.
        if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
-           set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
+                set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
            return True
        return False

@@ -1662,7 +1665,7 @@ to transpile() call.")

    def _is_optimizer_op(self, op):
        if "Param" in op.input_names and \
-            "LearningRate" in op.input_names:
+                "LearningRate" in op.input_names:
            return True
        return False

@@ -1737,7 +1740,7 @@ to transpile() call.")
                # NOTE: we need to skip all optimize ops, since it is connected
                # with forward/backward ops and lr ops, we only need the lr ops.
                if op1 != op2 and self._is_op_connected(op1, op2) and \
-                    not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
+                        not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
                    ufind.union(op1, op2)
        # find all ops which is related with lr var
        for op1 in block.ops:

--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -163,7 +163,7 @@ class InferenceTranspiler(object):
                next_op = self.block.ops[i + 1]
                if next_op.type == 'relu':
                    # modify bnorm OP to include relu
-                    current_op.set_attr("fuse_with_relu", True)
+                    current_op._set_attr("fuse_with_relu", True)
                    # remove relu OP
                    self.block._remove_op(i + 1)
            i = i + 1
@@ -377,7 +377,7 @@ class InferenceTranspiler(object):
                type=old_var.type,
                dtype=old_var.dtype,
                shape=old_var.shape)
-            op.rename_input(old_param_name, new_param_name)
+            op._rename_input(old_param_name, new_param_name)
            self.scope.var(new_param_name)

            tensor = self.scope.find_var(new_param_name).get_tensor()
@@ -463,8 +463,8 @@ class InferenceTranspiler(object):
            current_op = self.block.ops[i]
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
-                                            self.input_map[input_arg])
+                    current_op._rename_input(input_arg,
+                                             self.input_map[input_arg])

    def _remove_unused_var(self):
        '''

--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -14,10 +14,10 @@

 from __future__ import print_function

-from collections import defaultdict, OrderedDict, Callable
+from collections import defaultdict, MutableSet
 from .. import core
 from ... import compat as cpt
-from ..framework import Program, default_main_program, Parameter, Variable
+from ..framework import Program, default_main_program, Parameter, Variable, core
 from ..backward import _rename_arg_
 from functools import reduce
 from six.moves import range
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
 PRINT_LOG = False


+class OrderedSet(MutableSet):
+    def __init__(self, iterable=None):
+        self.end = end = []
+        end += [None, end, end]  # sentinel node for doubly linked list
+        self.map = {}  # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+
+    def __len__(self):
+        return len(self.map)
+
+    def __contains__(self, key):
+        return key in self.map
+
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+
+    def update(self, other):
+        for e in other:
+            self.add(e)
+
+    def discard(self, key):
+        if key in self.map:
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+
+    def remove(self, key):
+        self.discard(key)
+
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__, )
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
+
+
 class ControlFlowGraph(object):
    def __init__(self, program, ops, forward_num, skip_opt):
        self._program = program
        self._ops = ops
        self._forward_num = forward_num
-        self._successors = defaultdict(set)
-        self._presuccessors = defaultdict(set)
-        self._uses = defaultdict(set)
-        self._defs = defaultdict(set)
-        self._live_in = defaultdict(set)
-        self._live_out = defaultdict(set)
+        self._successors = defaultdict(OrderedSet)
+        self._presuccessors = defaultdict(OrderedSet)
+        self._uses = defaultdict(OrderedSet)
+        self._defs = defaultdict(OrderedSet)
+        self._live_in = defaultdict(OrderedSet)
+        self._live_out = defaultdict(OrderedSet)
        self._skip_opt = skip_opt
        self.pool = []

@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
        # NOTE: must sort the in_diff set for cases that get different cache var.
        # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
        can_optimize = [
-            x for x in sorted(list(in_diff))
+            x for x in in_diff
            if self._check_var_validity(block_desc, x, is_forward)
        ]
        if can_optimize:
@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
            if self.pool:
                # NOTE: must sort the in_diff set for cases that get different cache var.
                defs_can_optimize = [
-                    x for x in sorted(list(self._defs[i]))
+                    x for x in self._defs[i]
                    if self._check_var_validity(block_desc, x, is_forward)
                ]
                out_pair = [
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
    return cfgs


-def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
+def _is_opt_role_op(op):
+    op_maker = core.op_proto_and_checker_maker
+    optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+    if op_maker.kOpRoleAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        return True
+
+
+def memory_optimize(input_program,
+                    skip_opt_set=None,
+                    print_log=False,
+                    level=0,
+                    skip_grads=False):
    """Optimize memory by reusing var memory.

      Note: it doesn't not support subblock nested in subblock.
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
        raise ValueError("only support opt_level 0 or 1.")
    global PRINT_LOG
    PRINT_LOG = print_log
+    if skip_grads:
+        grad_set = set()
+        OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        for op in input_program.global_block().ops:
+            if _is_opt_role_op(op):
+                if op.attr(OP_ROLE_VAR):
+                    grad_name = op.attr(OP_ROLE_VAR)[1]
+                    grad_set.add(grad_name)
+        if not skip_opt_set:
+            skip_opt_set = grad_set
+        else:
+            skip_opt_set.update(grad_set)
+
    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -106,6 +106,7 @@ packages=['paddle',
          'paddle.fluid.layers',
          'paddle.fluid.contrib',
          'paddle.fluid.contrib.decoder',
+          'paddle.fluid.contrib.quantize',
          'paddle.fluid.transpiler',
          'paddle.fluid.transpiler.details']