Merge pull request #1 from PaddlePaddle/develop

Sync with origin

Merge pull request #1 from PaddlePaddle/develop
Sync with origin
92c562b8 · Wang Guibao · GitHub · 55467169 · 643b6faa · 92c562b8
116 changed file
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif()

 if(WIN32)
-  # windows stupid compile option for all targets.
+  # windows header option for all targets.
  add_definitions(-D_XKEYCHECK_H)
+  # Pass libraries, objects and includes through response files to reduce the cmake link command length.
+  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+  SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
+  SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
+
+  # Specify the program to use when building static libraries
+  SET(CMAKE_C_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
+  SET(CMAKE_CXX_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
+
+  # Only MSVC is supported on Windows (see the error text below); abort the configure step otherwise.
+  if (NOT MSVC)
+    message(FATAL_ERROR "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
+  endif(NOT MSVC)
 endif(WIN32)

 if(NOT WITH_GOLANG)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -27,7 +27,6 @@ endfunction()

 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name)
    endif()
 endmacro()

+# Rewrite /MD to /MT unless building shared libs; if() guard because return() in a macro leaves the calling file.
+macro(safe_set_static_flag)
+    if(NOT BUILD_SHARED_LIBS)
+        foreach(flag_var
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+            if(${flag_var} MATCHES "/MD")
+                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+            endif()
+        endforeach()
+    endif()
+endmacro()

 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "")

 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
+
+# https://github.com/PaddlePaddle/Paddle/issues/12773
+if (NOT WIN32)
 set(COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
+    -Werror
    -Wall
    -Wextra
    -Wnon-virtual-dtor
@@ -114,11 +131,6 @@ set(COMMON_FLAGS
    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )

-# https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-list(APPEND COMMON_FLAGS -Werror)
-endif()
-
 set(GPU_COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS
    -Wno-error=array-bounds # Warnings in Eigen::array
 )

+else(NOT WIN32)
+set(COMMON_FLAGS
+    "/w") #disable all warnings.
+set(GPU_COMMON_FLAGS
+    "/w") #disable all warnings
+endif(NOT WIN32)
+
 if (APPLE)
    if(NOT CMAKE_CROSSCOMPILING)
        # On Mac OS X build fat binaries with x86_64 architectures by default.
        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
    endif()
-else()
+endif(APPLE)
+
+if(UNIX AND NOT APPLE)  # LINUX is only set further down, so test its definition directly
     set(GPU_COMMON_FLAGS
         -Wall
         -Wextra
         -Werror
         ${GPU_COMMON_FLAGS})
-endif()
+endif(UNIX AND NOT APPLE)

 if(UNIX AND NOT APPLE)
  # except apple from nix*Os family
  set(LINUX TRUE)
 endif(UNIX AND NOT APPLE)

-
 foreach(flag ${COMMON_FLAGS})
    safe_set_cflag(CMAKE_C_FLAGS ${flag})
    safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
+
 endforeach()

 foreach(flag ${GPU_COMMON_FLAGS})
    safe_set_nvflag(${flag})
 endforeach()
+
+if(WIN32)
+  # Adjust the CRT flags via safe_set_static_flag(), then replace /W3 with /w to turn warnings off.
+  safe_set_static_flag()
+  foreach(msvc_flag_var
+      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+    if(${msvc_flag_var} MATCHES "/W3")
+      string(REGEX REPLACE "/W3" "/w" ${msvc_flag_var} "${${msvc_flag_var}}")
+    endif()
+  endforeach()
+endif()
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
 add_custom_target(paddle_apis ALL
-                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+                  DEPENDS paddle_v2_apis)

 add_custom_target(paddle_docs ALL
                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn
                  paddle_mobile_docs paddle_mobile_docs_cn)

 add_subdirectory(v2)
-add_subdirectory(fluid)
 add_subdirectory(mobile)
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -102,8 +102,8 @@ class Float16Transpiler:
                continue
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
-                                            self.input_map[input_arg])
+                    current_op._rename_input(input_arg,
+                                             self.input_map[input_arg])

    def _remove_unused_var(self):
        '''
@@ -187,7 +187,7 @@ class Float16Transpiler:
                    shape=var.shape,
                    persistable=var.persistable)
                find_op(var)
-                var.op.rename_output(var_name, tmp_var_name)
+                var.op._rename_output(var_name, tmp_var_name)
                self.block._insert_op(
                    i,
                    type="cast",

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
-paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -170,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_
 paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
 paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
 paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
+paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
+paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
+paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
+paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
+paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -241,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
 paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -315,6 +298,7 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
 endif()
+
+add_subdirectory(train)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -150,11 +150,10 @@ else()
 endif()
 
 if (NOT WIN32)
-  cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-          threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
-          graph graph_viz_pass multi_devices_graph_pass
-          multi_devices_graph_print_pass multi_devices_graph_check_pass
-          fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph build_strategy
+        fast_threaded_ssa_graph_executor)
 endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+
+cc_library(build_strategy SRCS build_strategy.cc DEPS
+        graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fuse_elewise_add_act_pass pass_builder)
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// PassBuilder pre-populated with the passes selected by a BuildStrategy.
+class ParallelExecutorPassBuilder : public ir::PassBuilder {
+ public:
+  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
+      : ir::PassBuilder(), strategy_(strategy) {
+    const bool dump_graphs = !strategy_.debug_graphviz_path_.empty();
+
+    // Record the graph before any other pass has been appended.
+    if (dump_graphs) {
+      AppendGraphVizPass("_original_graph");
+    }
+
+    // Optional op fusion, followed by a record of the fused graph.
+    if (strategy_.fuse_elewise_add_act_ops_) {
+      AppendPass("fuse_elewise_add_act_pass");
+      if (dump_graphs) {
+        AppendGraphVizPass("_fused_graph");
+      }
+    }
+
+    // Convert graph to run on multi-devices.
+    auto to_multi_devices = AppendPass("multi_devices_pass");
+    to_multi_devices->SetNotOwned<const BuildStrategy>("strategy", &strategy_);
+
+    // Add a graph print pass to record a graph with device info.
+    if (dump_graphs) {
+      auto device_printer = AppendPass("multi_devices_print_pass");
+      device_printer->SetNotOwned<const std::string>(
+          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      device_printer->Set<details::GraphvizSSAGraphPrinter>(
+          "graph_printer", new details::GraphvizSSAGraphPrinter);
+    }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
+  }
+
+ private:
+  // Appends a graph_viz_pass whose "graph_viz_path" attribute is
+  // debug_graphviz_path_ followed by `suffix`.
+  void AppendGraphVizPass(const char *suffix) {
+    auto viz_pass = AppendPass("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
+        "%s%s", strategy_.debug_graphviz_path_.c_str(), suffix);
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+  }
+
+  BuildStrategy strategy_;
+};
+
+// Builds a new ParallelExecutorPassBuilder from the current settings, stores
+// it in pass_builder_ and returns it.
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
+    const {
+  // The builder keeps a copy of *this, and that copy includes pass_builder_.
+  // Release our reference first so the new builder does not hold on to the
+  // previous one (repeated calls would otherwise chain every builder created).
+  pass_builder_.reset();
+  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  return pass_builder_;
+}
+
+// Builds an ir::Graph from main_program, runs every pass held by
+// pass_builder_ over it in order, and returns the resulting graph.
+std::unique_ptr<ir::Graph> BuildStrategy::Apply(
+    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+    const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
+#else
+    const bool use_cuda) const {
+#endif
+  // Fall back to the strategy-derived passes when none were created yet.
+  if (pass_builder_ == nullptr) {
+    CreatePassesFromStrategy();
+  }
+
+  std::unique_ptr<ir::Graph> result(new ir::Graph(main_program));
+
+  for (std::shared_ptr<ir::Pass> &current : pass_builder_->AllPasses()) {
+    const bool is_multi_devices = current->Type() == "multi_devices_pass";
+    if (is_multi_devices) {
+      // Each attribute is erased before being set again with this call's
+      // arguments, since the same pass object is reused across calls.
+      current->Erase("places");
+      current->SetNotOwned<const std::vector<platform::Place>>("places",
+                                                               &places);
+      current->Erase("loss_var_name");
+      current->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
+      current->Erase("params");
+      current->SetNotOwned<const std::unordered_set<std::string>>(
+          "params", &param_names);
+      current->Erase("local_scopes");
+      current->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                       &local_scopes);
+#ifdef PADDLE_WITH_CUDA
+      // Only hand over the NCCL contexts when CUDA is actually in use.
+      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      current->Erase("nccl_ctxs");
+      current->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+    }
+    result = current->Apply(std::move(result));
+  }
+  return result;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(fuse_elewise_add_act_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(multi_devices_pass);
+USE_PASS(multi_devices_check_pass);
+USE_PASS(multi_devices_print_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -15,6 +15,19 @@
 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -57,6 +68,30 @@ struct BuildStrategy {
  bool fuse_elewise_add_act_ops_{false};

  bool enable_data_balance_{false};
+
+  // User normally doesn't need to call this API.
+  // The PassBuilder allows for more customized insert, remove of passes
+  // from python side.
+  // A new PassBuilder is created based on configs defined above and
+  // passes are owned by the PassBuilder.
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+
+  // Apply the passes built by the pass_builder_. The passes will be
+  // applied to the Program and output an ir::Graph.
+  std::unique_ptr<ir::Graph> Apply(
+      const ProgramDesc &main_program,
+      const std::vector<platform::Place> &places,
+      const std::string &loss_var_name,
+      const std::unordered_set<std::string> &param_names,
+      const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
+#else
+      const bool use_cuda) const;
+#endif
+
+ private:
+  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };

 }  // namespace details

--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,79 +20,37 @@ namespace paddle {
 namespace framework {
 namespace details {

-// Change it to thread safe flags if needed.
-class ThreadUnsafeOwnershipFlags {
+template <class T>
+class COWPtr {
 public:
-  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
-
-  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags& operator=(
-      const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
-
-  void SetOwnership(bool flag) { flag_ = flag; }
-
-  // Invoke the callback if it is not owned.
-  template <typename Callback>
-  void AcquireOwnershipOnce(Callback acquire) {
-    if (!flag_) {
-      acquire();
-      flag_ = true;
-    }
-  }
+  typedef std::shared_ptr<T> RefPtr;

 private:
-  bool flag_;
-};
+  RefPtr m_sp;

-// Copy-On-Write pointer.
-// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
-//
-// The template parameter OwnershipFlags should have:
-//   * a constructor takes a bool. True if own.
-//   * SetOwnership(bool flag).
-//   * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
-//     owned.
-//
-// https://en.wikipedia.org/wiki/Copy-on-write
-template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
-class COWPtr {
 public:
-  // Ctor from raw pointer.
-  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+  COWPtr() : m_sp(nullptr) {}
+  explicit COWPtr(T* t) : m_sp(t) {}

-  // Move methods. Steal ownership from origin
-  COWPtr(COWPtr&& other)
-      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
-  COWPtr& operator=(COWPtr&& origin) = default;
+  const T& Data() const { return *m_sp; }

-  // Copy methods. Not own payload
-  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
-  COWPtr& operator=(const COWPtr& other) {
-    payload_ = other.payload_;
-    ownership_.SetOwnership(false);
-    return *this;
-  }
-
-  // Access read only data.
-  const T& Data() const { return *payload_; }
-
-  // Access mutable data. If the data is not owned, the data will be copied
-  // before.
  T* MutableData() {
-    ownership_.AcquireOwnershipOnce(
-        [this] { payload_.reset(new T(*payload_)); });
-    return payload_.get();
+    DetachIfNotUnique();
+    return m_sp.get();
  }

- private:
-  // Actual data pointer.
-  std::shared_ptr<T> payload_;
+  void DetachIfNotUnique() {
+    T* tmp = m_sp.get();
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
+  }

-  // Ownership flag.
-  OwnershipFlags ownership_;
+  void Detach() {
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
+  }
 };
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
  ASSERT_EQ(ptr2.Data(), 10);
 }

+TEST(COWPtr, change_old) {
+  COWPtr<int> ptr(new int{0});
+  COWPtr<int> ptr2 = ptr;
+  *ptr.MutableData() = 10;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(ptr.Data(), 10);
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -41,6 +41,8 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass

 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

+cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
+
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)

--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PDPattern external_pattern, subblock_pattern;

+  // Use the following variables to tell whether this model is RNN1.
+  // This fusion only works on the RNN1 model.
+  std::unordered_set<std::string> specified_vars({"data_lod_attention",
+                                                  "cell_init", "hidden_init",
+                                                  "data", "week", "minute"});
+  size_t count = 0;  // unsigned, to match specified_vars.size()
+  for (auto* node : graph->Nodes()) {
+    if (node->IsVar() && specified_vars.count(node->Name())) {
+      ++count;
+    }
+  }
+  if (count < specified_vars.size()) {
+    return graph;
+  }
+
+  // Continue to fuse.
  FindWhileOp(graph.get());
  return graph;
 }

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());

-  std::unordered_set<Node*> nodes2delete;
-
  GraphPatternDetector gpd;
  auto* conv_input = gpd.mutable_pattern()
                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
@@ -42,36 +40,20 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
                     Graph* g) {
    VLOG(4) << "handle ConvReLU fuse";
    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
-                              conv_relu_pattern);  // Filter
-    GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern);  // Bias
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);    // tmp
+                              conv_relu_pattern);                      // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);  // tmp
    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);  // CONV op
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op

-    // Create an ConvReLU Node.
-    OpDesc desc;
-    std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
-    std::string conv_relu_w_in = conv_weight->Name();
-    std::string conv_relu_b_in = conv_bias->Name();
-    std::string conv_relu_out = relu_out->Name();
-    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
-    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
-    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
-    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
-    desc.SetType("conv2d");
-    for (auto& attr : conv->Op()->GetAttrMap()) {
-      desc.SetAttr(attr.first, attr.second);
-    }
-    desc.SetAttr("fuse_relu", true);
-    auto conv_relu_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
+    // Transform Conv node into ConvReLU node.
+    OpDesc* desc = conv->Op();
+    desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
+    desc->SetAttr("fuse_relu", true);
+    GraphSafeRemoveNodes(graph.get(), {relu, conv_out});

    PADDLE_ENFORCE(subgraph.count(conv_input));
-    IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
-    IR_NODE_LINK_TO(conv_weight, conv_relu_node);
-    IR_NODE_LINK_TO(conv_bias, conv_relu_node);
-    IR_NODE_LINK_TO(conv_relu_node, relu_out);
+    IR_NODE_LINK_TO(conv, relu_out);

    found_conv_relu_count++;
  };

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) {

  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()->Type() == "conv2d") {
-      if (node->Op()->HasAttr("use_mkldnn")) {
-        bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn"));
-        if (use_mkldnn) {
-          if (node->Op()->HasAttr("fuse_relu")) {
-            bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu"));
-            if (fuse_relu) {
-              ++conv_relu_count;
-            }
-          }
-        }
+      auto* op = node->Op();
+      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
+      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
+      ASSERT_TRUE(op->HasAttr("fuse_relu"));
+      bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+      if (fuse_relu) {
+        ++conv_relu_count;
      }
    }
  }

--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    const std::string BatchedCellPreAct =
        patterns::UniqueKey("BatchedCellPreAct");
    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+    const std::string CheckedCell = patterns::UniqueKey("CheckedCell");

    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+    scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();

    op_desc.SetInput("H0", {});
    op_desc.SetInput("C0", {});
@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    op_desc.SetOutput("BatchedGate", {BatchedGate});
    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetOutput("CheckedCell", {CheckedCell});
    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
    // TODO(TJ): get from attr

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include <algorithm>
+#include <deque>
 #include <unordered_set>

-#include "paddle/fluid/framework/ir/graph_helper.h"
-
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
  return adj_list;
 }

+// Returns the number of weakly connected sub-graphs in `graph` (0 when the
+// graph has no nodes). Edges are followed in both directions, i.e. through
+// both `inputs` and `outputs` of every node. With VLOG(10) enabled, the size
+// of each sub-graph is logged and sub-graphs under 10 nodes are dumped.
+size_t GraphNum(const Graph &graph) {
+  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> visited_nodes;
+  visited_nodes.reserve(nodes.size());
+  std::deque<ir::Node *> q_nodes;
+  // One entry per sub-graph discovered so far.
+  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
+
+  // Mark a neighbour as visited at the moment it is enqueued, so that every
+  // node enters the queue at most once.
+  auto enqueue_unvisited = [&visited_nodes,
+                            &q_nodes](const std::vector<ir::Node *> &adj) {
+    for (ir::Node *node : adj) {
+      if (visited_nodes.insert(node).second) {
+        q_nodes.push_back(node);
+      }
+    }
+  };
+
+  // Every node that is still unvisited seeds a breadth-first search that
+  // collects exactly one sub-graph. Iterating over `nodes` (instead of
+  // looping until two set sizes match) always terminates, even if an edge
+  // refers to a node that is not registered in the graph.
+  for (ir::Node *seed : nodes) {
+    if (!visited_nodes.insert(seed).second) {
+      continue;  // already reached from an earlier seed
+    }
+    graph_nodes.emplace_back();
+    std::unordered_set<ir::Node *> &g_nodes = graph_nodes.back();
+    q_nodes.push_back(seed);
+    while (!q_nodes.empty()) {
+      ir::Node *cur_node = q_nodes.front();
+      q_nodes.pop_front();
+      g_nodes.insert(cur_node);
+      enqueue_unvisited(cur_node->inputs);
+      enqueue_unvisited(cur_node->outputs);
+    }
+  }
+
+  if (VLOG_IS_ON(10)) {
+    VLOG(10) << "graph_num: " << graph_nodes.size();
+    for (auto &g_n : graph_nodes) {
+      VLOG(10) << "graph_nodes: " << g_n.size();
+      if (g_n.size() < 10) {
+        std::stringstream out;
+        for (auto &node : g_n) {
+          out << "\nNode: " << node->Name() << " in [";
+          for (auto &n : node->inputs) {
+            out << n->Name() << ", ";
+          }
+          out << "], out[";
+          for (auto &n : node->outputs) {
+            out << n->Name() << ", ";
+          }
+          out << "]";
+        }
+        VLOG(10) << out.str();
+      }
+    }
+  }
+
+  return graph_nodes.size();
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -27,6 +27,8 @@ namespace ir {
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);

+size_t GraphNum(const Graph &graph);
+
 // Topology Sort the operations in the graph from inputs to outputs.
 // `graph` cannot contain circle.
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph);

--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) {
  ASSERT_EQ(node_map.at("op2"), 1UL);
  ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
 }
+
+void BuildZeroGraph(Graph* g) {}  // intentionally adds no nodes to the graph
+
+void BuildOneGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+  // The links below join all nine nodes into one connected graph.
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3->v4->o5
+  o3->outputs.push_back(v4);
+  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  v4->outputs.push_back(o5);
+}
+
+void BuildTwoGraphs(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+  // Two disconnected parts: {o1,v1,o2,v2,o3,o4,v4} and {v3,o5}.
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // v3->o5 only; the o2->v3 link is deliberately left out.
+  //  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  //  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3->v4 only; the v4->o5 link is deliberately left out.
+  o3->outputs.push_back(v4);
+  //  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  //  v4->outputs.push_back(o5);
+}
+
+TEST(GraphHelperTest, GraphNum) {
+  ProgramDesc prog;
+  // A graph without nodes contains no sub-graph at all.
+  Graph g(prog);
+  BuildZeroGraph(&g);
+  ASSERT_EQ(GraphNum(g), 0UL);
+  // Nodes that are all linked together form a single sub-graph.
+  Graph g2(prog);
+  BuildOneGraph(&g2);
+  ASSERT_EQ(GraphNum(g2), 1UL);
+  // Cutting the links around var3/op5 splits the nodes into two sub-graphs.
+  Graph g3(prog);
+  BuildTwoGraphs(&g3);
+  ASSERT_EQ(GraphNum(g3), 2UL);
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()(
                              ->AsInput()
                              ->assert_is_persistable_var()
                              ->assert_is_op_input("conv2d", "Filter");
-  // Bias
-  auto *conv_bias_var = pattern->NewNode(conv_bias_repr())
-                            ->AsInput()
-                            ->assert_is_persistable_var()
-                            ->assert_is_op_input("conv2d", "Bias");
  // intermediate variable, will be removed in the IR after fuse.
  auto *conv_out_var = pattern->NewNode(conv_out_repr())
                           ->AsIntermediate()
@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()(
                           ->AsOutput()
                           ->assert_is_op_output("relu");

-  conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var})
-      .LinksTo({conv_out_var});
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
  relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
  return relu_out_var;
 }

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -379,7 +379,7 @@ struct PatternBase {
 // op: conv + relu
 // named nodes:
 // conv_input, conv_weight,
-// conv_bias, conv_out, conv,
+// conv_out, conv,
 // relu_out, relu
 struct ConvReLU : public PatternBase {
  ConvReLU(PDPattern* pattern, const std::string& name_scope)
@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase {
  PATTERN_DECL_NODE(relu);
  // declare variable node's name
  PATTERN_DECL_NODE(conv_weight);
-  PATTERN_DECL_NODE(conv_bias);
  PATTERN_DECL_NODE(conv_out);
  PATTERN_DECL_NODE(relu_out);
 };

--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,8 @@

 #include "paddle/fluid/framework/ir/graph_traits.h"

+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace ir {

--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
-  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
  for (const std::string& attr : required_pass_attrs_) {
    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),

--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -42,6 +42,8 @@ class Pass {
    attr_dels_.clear();
  }

+  std::string Type() const { return type_; }  // type_ is set by RegisterType()
+
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;

  // Get a reference to the attributed previously set.
@@ -52,6 +54,21 @@ class Pass {
    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
  }

+  bool Has(const std::string &attr_name) const {
+    return attrs_.count(attr_name) > 0;  // is an attribute stored under it?
+  }
+
+  // Removes the attribute `attr_name` from the pass. If a deleter callback
+  // is registered for it, the callback is run and dropped first.
+  void Erase(const std::string &attr_name) {
+    if (attrs_.count(attr_name) == 0) return;  // nothing stored: no-op
+    auto deleter = attr_dels_.find(attr_name);
+    if (deleter != attr_dels_.end()) {
+      deleter->second();
+      attr_dels_.erase(deleter);
+    }
+    attrs_.erase(attr_name);
+  }
+
  // Set a pointer to the attribute. Pass takes ownership of the attribute.
  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr) {
@@ -68,13 +85,15 @@ class Pass {
  // should delete the attribute.
  template <typename AttrType>
  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
    attrs_[attr_name] = attr;
  }

 protected:
-  virtual std::unique_ptr<Graph> ApplyImpl(
-      std::unique_ptr<Graph> graph) const = 0;
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    LOG(FATAL) << "Calling virtual Pass not implemented.";
+  }

 private:
  template <typename PassType>
@@ -89,7 +108,10 @@ class Pass {
    required_graph_attrs_.insert(attrs.begin(), attrs.end());
  }

+  void RegisterType(const std::string &type) { type_ = type; }
+
  mutable bool applied_{false};
+  std::string type_;
  std::unordered_set<std::string> required_pass_attrs_;
  std::unordered_set<std::string> required_graph_attrs_;
  std::map<std::string, boost::any> attrs_;
@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
                   "'%s' is registered more than once.", pass_type);
    PassRegistry::Instance().Insert(
-        pass_type, [this]() -> std::unique_ptr<Pass> {
+        pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
          std::unique_ptr<Pass> pass(new PassType());
          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          pass->RegisterType(pass_type);
          return pass;
        });
  }

--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Creates a pass of type `pass_type` through the PassRegistry, appends it to
+// the end of the pass list and returns the shared handle to it.
+std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
+  // Hand ownership to a shared_ptr before touching the vector: releasing the
+  // raw pointer straight into emplace_back() would leak the pass if the
+  // vector failed to grow.
+  std::shared_ptr<Pass> pass(ir::PassRegistry::Instance().Get(pass_type));
+  passes_.push_back(pass);
+  return pass;
+}
+
+// Removes the pass stored at position `idx`. Enforces that `idx` is smaller
+// than the current number of passes.
+void PassBuilder::RemovePass(size_t idx) {
+  PADDLE_ENFORCE(passes_.size() > idx,
+                 "Cannot remove pass %d: only %d passes in the builder.", idx,
+                 passes_.size());
+  passes_.erase(passes_.begin() + idx);
+}
+
+// Creates a pass of type `pass_type` and inserts it at position `idx`, i.e.
+// in front of the pass currently stored there. `idx` may equal the number of
+// passes, in which case the new pass is appended. Returns the new pass.
+std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
+                                              const std::string& pass_type) {
+  PADDLE_ENFORCE(passes_.size() >= idx,
+                 "Cannot insert pass at %d: only %d passes in the builder.",
+                 idx, passes_.size());
+  std::shared_ptr<Pass> pass(ir::PassRegistry::Instance().Get(pass_type));
+  passes_.insert(passes_.begin() + idx, pass);
+  return pass;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ b/paddle/fluid/framework/ir/pass_builder.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Keeps an ordered list of passes. Each pass is created from its type name
+// through the PassRegistry (see pass_builder.cc).
+class PassBuilder {
+ public:
+  PassBuilder() {}
+
+  virtual ~PassBuilder() {}
+
+  // Append a new pass to the end. Returns the newly created pass.
+  std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
+
+  // Insert a new pass at position `idx`, i.e. in front of the pass that is
+  // currently stored there. `idx` may equal the number of passes, which
+  // appends. Returns the newly created pass.
+  std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
+
+  // Remove the pass at position `idx`. `idx` must be smaller than the number
+  // of passes.
+  void RemovePass(size_t idx);
+
+  // Returns a copy of the list of all passes, in order.
+  std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
+
+ protected:
+  // The passes, in the order produced by AppendPass/InsertPass/RemovePass.
+  std::vector<std::shared_ptr<Pass>> passes_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);

-  try {
-    graph = pass->Apply(std::move(graph));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+  // Allow apply more than once.
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph = pass->Apply(std::move(graph));

  pass = PassRegistry::Instance().Get("test_pass");
  pass->SetNotOwned<int>("test_pass_attr", &val);

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,10 +17,13 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #include <vector>
-
+#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memcpy.h"

 #include "glog/logging.h"

@@ -28,206 +31,436 @@ namespace paddle {
 namespace framework {

 #if defined(PADDLE_WITH_CUDA)
+namespace details {
+// Owns one raw memory::Alloc() block on a CUDAPlace; freed on destruction.
+struct CUDABuffer {
+  void *data_{nullptr};
+  size_t size_{0};
+  platform::CUDAPlace place_;
+
+  CUDABuffer() {}
+  CUDABuffer(platform::Place place, size_t size)
+      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
+    data_ = memory::Alloc(place_, size);
+  }
+  ~CUDABuffer() { ClearMemory(); }
+
+  CUDABuffer(const CUDABuffer &o) = delete;
+  CUDABuffer &operator=(const CUDABuffer &o) = delete;
+  // Releases the current memory, then allocates `size` bytes on `place`.
+  void Resize(platform::Place place, size_t size) {
+    ClearMemory();
+    place_ = boost::get<platform::CUDAPlace>(place);
+    data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
+    size_ = size;
+  }
+
+  void Swap(CUDABuffer &o) {
+    std::swap(data_, o.data_);
+    std::swap(place_, o.place_);
+    std::swap(size_, o.size_);
+  }
+
+ private:
+  void ClearMemory() {
+    if (data_ != nullptr) memory::Free(place_, data_);
+    data_ = nullptr;  // reset so a failed Resize() cannot cause a second Free
+    size_ = 0;
+  }
+};
+}  // namespace details
+
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
 class Vector {
 public:
  using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;

-  // Default ctor. Create empty Vector
-  Vector() { InitEmpty(); }
+ private:
+  // The actual class to implement vector logic
+  class VectorData {
+   public:
+    VectorData() : flag_(kDataInCPU) {}
+    VectorData(size_t count, const T &value)
+        : cpu_(count, value), flag_(kDataInCPU) {}
+    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
+    template <typename U>
+    explicit VectorData(const std::vector<U> &dat)
+        : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
+
+    VectorData(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+    }

-  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T &value = T()) {
-    InitEmpty();
-    if (count != 0) {
-      resize(count);
-      T *ptr = begin();
-      for (size_t i = 0; i < count; ++i) {
-        ptr[i] = value;
+    VectorData &operator=(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+      details::CUDABuffer null;
+      gpu_.Swap(null);
+      return *this;
+    }
+
+    T &operator[](size_t i) {
+      MutableCPU();
+      return cpu_[i];
+    }
+
+    const T &operator[](size_t i) const {
+      ImmutableCPU();
+      return cpu_[i];
+    }
+
+    size_t size() const { return cpu_.size(); }
+
+    iterator begin() {
+      MutableCPU();
+      return cpu_.begin();
+    }
+
+    iterator end() {
+      MutableCPU();
+      return cpu_.end();
+    }
+
+    T &front() {
+      MutableCPU();
+      return cpu_.front();
+    }
+
+    T &back() {
+      MutableCPU();
+      return cpu_.back();
+    }
+
+    const_iterator begin() const {
+      ImmutableCPU();
+      return cpu_.begin();
+    }
+
+    const_iterator end() const {
+      ImmutableCPU();
+      return cpu_.end();
+    }
+
+    const T &back() const {
+      ImmutableCPU();
+      return cpu_.back();
+    }
+
+    T *data() { return &(*this)[0]; }
+
+    const T *data() const { return &(*this)[0]; }
+
+    const T &front() const {
+      ImmutableCPU();
+      return cpu_.front();
+    }
+
+    // assign this from iterator.
+    // NOTE: the iterator must support `end-begin`
+    template <typename Iter>
+    void assign(Iter begin, Iter end) {
+      MutableCPU();
+      cpu_.assign(begin, end);
+    }
+
+    // push_back. If the previous capacity is not enough, the memory will
+    // double.
+    void push_back(T elem) {
+      MutableCPU();
+      cpu_.push_back(elem);
+    }
+
+    // extend a vector by iterator.
+    // NOTE: the iterator must support end-begin
+    template <typename It>
+    void Extend(It begin, It end) {
+      MutableCPU();
+      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
+      std::copy(begin, end, out_it);
+    }
+
+    // resize the vector
+    void resize(size_t size) {
+      MutableCPU();
+      cpu_.resize(size);
+    }
+
+    // get cuda ptr. immutable
+    const T *CUDAData(platform::Place place) const {
+      PADDLE_ENFORCE(platform::is_gpu_place(place),
+                     "CUDA Data must on CUDA place");
+      ImmutableCUDA(place);
+      return reinterpret_cast<T *>(gpu_.data_);
+    }
+
+    // get cuda ptr. mutable
+    T *CUDAMutableData(platform::Place place) {
+      const T *ptr = CUDAData(place);
+      flag_ = kDirty | kDataInCUDA;
+      return const_cast<T *>(ptr);
+    }
+
+    // clear
+    void clear() {
+      cpu_.clear();
+      flag_ = kDirty | kDataInCPU;
+    }
+
+    size_t capacity() const { return cpu_.capacity(); }
+
+    // reserve data
+    void reserve(size_t size) const { cpu_.reserve(size); }
+
+    // implicit cast operator. Vector can be cast to std::vector implicitly.
+    operator std::vector<T>() const {
+      ImmutableCPU();
+      return cpu_;
+    }
+
+    bool operator==(const VectorData &other) const {
+      ImmutableCPU();
+      other.ImmutableCPU();
+      return cpu_ == other.cpu_;
+    }
+
+    std::mutex &Mutex() const { return mtx_; }
+
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
+      if (gpu_.data_ == nullptr) {
+        return nullptr;
+      } else {
+        return std::unique_ptr<platform::CUDAPlace>(
+            new platform::CUDAPlace(gpu_.place_));
      }
    }
-  }

-  // Ctor with init_list
-  Vector(std::initializer_list<T> init) {
-    if (init.size() == 0) {
-      InitEmpty();
-    } else {
-      InitByIter(init.size(), init.begin(), init.end());
+   private:
+    enum DataFlag {
+      kDataInCPU = 0x01,
+      kDataInCUDA = 0x02,
+      // kDirty means the data has been changed in one device.
+      kDirty = 0x10
+    };
+
+    void CopyToCPU() const {
+      // COPY GPU Data To CPU
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
+      void *src = gpu_.data_;
+      void *dst = cpu_.data();
+      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
+                   stream);
+      dev_ctx->Wait();
+    }
+
+    void MutableCPU() {
+      if (IsInCUDA() && IsDirty()) {
+        CopyToCPU();
+      }
+      flag_ = kDirty | kDataInCPU;
    }
-  }
+
+    void ImmutableCUDA(platform::Place place) const {
+      if (IsDirty()) {
+        if (IsInCPU()) {
+          CopyCPUDataToCUDA(place);
+          UnsetFlag(kDirty);
+          SetFlag(kDataInCUDA);
+        } else if (IsInCUDA() &&
+                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+          PADDLE_THROW("This situation should not happen");
+          // Still dirty
+        } else {
+          // Dirty && DataInCUDA && Device is same
+          // Do nothing
+        }
+      } else {
+        if (!IsInCUDA()) {
+          // Even data is not dirty. However, data is not in CUDA. Copy data.
+          CopyCPUDataToCUDA(place);
+          SetFlag(kDataInCUDA);
+        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+          PADDLE_THROW("This situation should not happen.");
+        } else {
+          // Not Dirty && DataInCUDA && Device is same
+          // Do nothing.
+        }
+      }
+    }
+
+    void CopyCPUDataToCUDA(const platform::Place &place) const {
+      void *src = cpu_.data();
+      gpu_.Resize(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_.data_;
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
+      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
+                   stream);
+    }
+
+    void ImmutableCPU() const {
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
+                                      // CPU has no data.
+        CopyToCPU();
+        UnsetFlag(kDirty);
+      }
+      SetFlag(kDataInCPU);
+    }
+
+    void UnsetFlag(int flag) const { flag_ &= ~flag; }
+    void SetFlag(int flag) const { flag_ |= flag; }
+
+    bool IsDirty() const { return flag_ & kDirty; }
+
+    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+    bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+    mutable std::vector<T> cpu_;
+    mutable details::CUDABuffer gpu_;
+    mutable int flag_;
+
+    mutable std::mutex mtx_;
+  };
+
+ public:
+  // Default ctor. Create empty Vector
+  Vector() : m_(new VectorData()) {}
+
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T &value = T())
+      : m_(new VectorData(count, value)) {}
+
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}

  // implicit cast from std::vector.
  template <typename U>
-  Vector(const std::vector<U> &dat) {  // NOLINT
-    if (dat.size() == 0) {
-      InitEmpty();
-    } else {
-      InitByIter(dat.size(), dat.begin(), dat.end());
-    }
+  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
  }

  // Copy ctor
-  Vector(const Vector<T> &other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { m_ = other.m_; }

  // Copy operator
  Vector<T> &operator=(const Vector<T> &other) {
-    if (other.size() != 0) {
-      this->InitByIter(other.size(), other.begin(), other.end());
-    } else {
-      InitEmpty();
-    }
+    m_ = other.m_;
    return *this;
  }

  // Move ctor
-  Vector(Vector<T> &&other) {
-    this->size_ = other.size_;
-    this->flag_ = other.flag_;
-    if (other.cuda_vec_.memory_size()) {
-      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
-    }
-    if (other.cpu_vec_.memory_size()) {
-      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
-    }
-  }
+  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }

  // CPU data access method. Mutable.
-  T &operator[](size_t i) {
-    MutableCPU();
-    return const_cast<T *>(cpu_vec_.data<T>())[i];
-  }
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }

  // CPU data access method. Immutable.
-  const T &operator[](size_t i) const {
-    ImmutableCPU();
-    return cpu_vec_.data<T>()[i];
-  }
+  const T &operator[](size_t i) const { return m_.Data()[i]; }

  // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return size_; }
+  size_t size() const { return m_.Data().size(); }

-  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  iterator begin() { return m_.MutableData()->begin(); }

-  T *end() {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  iterator end() { return m_.MutableData()->end(); }

-  T &front() { return *begin(); }
+  T &front() { return m_.MutableData()->front(); }

-  T &back() {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  T &back() { return m_.MutableData()->back(); }

-  const T *begin() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
-  }
+  const_iterator begin() const { return m_.Data().begin(); }

-  const T *end() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  const_iterator end() const { return m_.Data().end(); }

-  const T *cbegin() const { return begin(); }
+  const_iterator cbegin() const { return begin(); }

-  const T *cend() const { return end(); }
+  const_iterator cend() const { return end(); }

-  const T &back() const {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  const T &back() const { return m_.Data().back(); }

-  T *data() { return begin(); }
+  T *data() { return m_.MutableData()->data(); }

-  const T *data() const { return begin(); }
+  const T *data() const { return m_.Data().data(); }

-  const T &front() const { return *begin(); }
+  const T &front() const { return m_.Data().front(); }
  // end of std::vector iterator methods

  // assign this from iterator.
  // NOTE: the iterator must support `end-begin`
  template <typename Iter>
  void assign(Iter begin, Iter end) {
-    InitByIter(end - begin, begin, end);
+    m_.MutableData()->assign(begin, end);
  }

  // push_back. If the previous capacity is not enough, the memory will
  // double.
-  void push_back(T elem) {
-    if (size_ + 1 > capacity()) {
-      reserve((size_ + 1) << 1);
-    }
-    *end() = elem;
-    ++size_;
-  }
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }

  // extend a vector by iterator.
  // NOTE: the iterator must support end-begin
  template <typename It>
  void Extend(It begin, It end) {
-    size_t pre_size = size_;
-    resize(pre_size + (end - begin));
-    T *ptr = this->begin() + pre_size;
-    for (; begin < end; ++begin, ++ptr) {
-      *ptr = *begin;
-    }
+    m_.MutableData()->Extend(begin, end);
  }

  // resize the vector
  void resize(size_t size) {
-    if (size + 1 <= capacity()) {
-      size_ = size;
-    } else {
-      MutableCPU();
-      Tensor cpu_tensor;
-      platform::Place cpu = platform::CPUPlace();
-      T *ptr = cpu_tensor.mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T *old_ptr =
-          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
-      if (old_ptr != nullptr) {
-        std::copy(old_ptr, old_ptr + size_, ptr);
-      }
-      size_ = size;
-      cpu_vec_.ShareDataWith(cpu_tensor);
+    if (m_.Data().size() != size) {
+      m_.MutableData()->resize(size);
    }
  }

  // get cuda ptr. immutable
  const T *CUDAData(platform::Place place) const {
-    PADDLE_ENFORCE(platform::is_gpu_place(place),
-                   "CUDA Data must on CUDA place");
-    ImmutableCUDA(place);
-    return cuda_vec_.data<T>();
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place. Detach manually.
+    m_.Detach();
+    return CUDAData(place);
  }

  // get cuda ptr. mutable
  T *CUDAMutableData(platform::Place place) {
-    const T *ptr = CUDAData(place);
-    flag_ = kDirty | kDataInCUDA;
-    return const_cast<T *>(ptr);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place. Detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
  }

  // clear
-  void clear() {
-    size_ = 0;
-    flag_ = kDirty | kDataInCPU;
-  }
+  void clear() { m_.MutableData()->clear(); }

-  size_t capacity() const {
-    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
-  }
+  size_t capacity() const { return m_.Data().capacity(); }

  // reserve data
-  void reserve(size_t size) {
-    size_t pre_size = size_;
-    resize(size);
-    resize(pre_size);
-  }
+  void reserve(size_t size) { m_.Data().reserve(size); }

  // the unify method to access CPU or CUDA data. immutable.
  const T *Data(platform::Place place) const {
@@ -248,12 +481,7 @@ class Vector {
  }

  // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const {
-    std::vector<T> result;
-    result.resize(size());
-    std::copy(begin(), end(), result.begin());
-    return result;
-  }
+  operator std::vector<T>() const { return m_.Data(); }

  bool operator==(const Vector<T> &other) const {
    if (size() != other.size()) return false;
@@ -267,118 +495,11 @@ class Vector {
    return true;
  }

- private:
-  void InitEmpty() {
-    size_ = 0;
-    flag_ = kDataInCPU;
-  }
-
-  template <typename Iter>
-  void InitByIter(size_t size, Iter begin, Iter end) {
-    platform::Place cpu = platform::CPUPlace();
-    T *ptr = this->cpu_vec_.template mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-    for (size_t i = 0; i < size; ++i) {
-      *ptr++ = *begin++;
-    }
-    flag_ = kDataInCPU | kDirty;
-    size_ = size;
-  }
-
-  enum DataFlag {
-    kDataInCPU = 0x01,
-    kDataInCUDA = 0x02,
-    // kDirty means the data has been changed in one device.
-    kDirty = 0x10
-  };
-
-  void CopyToCPU() const {
-    // COPY GPU Data To CPU
-    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
-    WaitPlace(cuda_vec_.place());
-  }
-
-  void MutableCPU() {
-    if (IsInCUDA() && IsDirty()) {
-      CopyToCPU();
-    }
-    flag_ = kDirty | kDataInCPU;
-  }
-
-  void ImmutableCUDA(platform::Place place) const {
-    if (IsDirty()) {
-      if (IsInCPU()) {
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        UnsetFlag(kDirty);
-        SetFlag(kDataInCUDA);
-      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        cuda_vec_.ShareDataWith(tmp);
-        // Still dirty
-      } else {
-        // Dirty && DataInCUDA && Device is same
-        // Do nothing
-      }
-    } else {
-      if (!IsInCUDA()) {
-        // Even data is not dirty. However, data is not in CUDA. Copy data.
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        SetFlag(kDataInCUDA);
-      } else if (!(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        WaitPlace(cuda_vec_.place());
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        WaitPlace(place);
-        cuda_vec_.ShareDataWith(tmp);
-      } else {
-        // Not Dirty && DataInCUDA && Device is same
-        // Do nothing.
-      }
-    }
-  }
-
-  void ImmutableCPU() const {
-    if (IsDirty() &&
-        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
-      CopyToCPU();
-      UnsetFlag(kDirty);
-    }
-    SetFlag(kDataInCPU);
-  }
-
-  void UnsetFlag(int flag) const { flag_ &= ~flag; }
-  void SetFlag(int flag) const { flag_ |= flag; }
+  const void *Handle() const { return &m_.Data(); }

-  bool IsDirty() const { return flag_ & kDirty; }
-
-  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
-
-  bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-  static void WaitPlace(const platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      platform::DeviceContextPool::Instance()
-          .Get(boost::get<platform::CUDAPlace>(place))
-          ->Wait();
-    }
-  }
-
-  static T &EmptyDummy() {
-    static T dummy = T();
-    return dummy;
-  }
-
-  mutable int flag_;
-  mutable Tensor cpu_vec_;
-  mutable Tensor cuda_vec_;
-  size_t size_;
+ private:
+  // Vector is a COW object.
+  mutable details::COWPtr<VectorData> m_;
 };

 #else  // PADDLE_WITH_CUDA

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-
 #include <string>
 #include <tuple>
 #include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"

 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"

 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -35,80 +33,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes, const bool use_cuda,
-#ifdef PADDLE_WITH_CUDA
-    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
-#else
-    const BuildStrategy &strategy) {
-#endif
-  // Convert the program to graph.
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-
-  // Apply a graph viz pass to record a graph.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
-
-  // Apply op fusion.
-  if (strategy.fuse_elewise_add_act_ops_) {
-    auto fuse_elewise_add_act_pass =
-        ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
-    graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
-    // Apply a graph viz pass to record a graph.
-    if (!strategy.debug_graphviz_path_.empty()) {
-      auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-      const std::string graph_path = string::Sprintf(
-          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
-      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-      graph = viz_pass->Apply(std::move(graph));
-    }
-  }
-
-  // Convert graph to run on multi-devices.
-  auto multi_devices_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_pass");
-  multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
-                                                                      &places);
-  multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
-                                                     &loss_var_name);
-  multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
-      "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
-                                                              &local_scopes);
-  multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
-
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-  multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
-#endif
-  graph = multi_devices_pass->Apply(std::move(graph));
-
-  // Apply a graph print pass to record a graph with device info.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto multi_devices_print_pass =
-        ir::PassRegistry::Instance().Get("multi_devices_print_pass");
-    multi_devices_print_pass->SetNotOwned<const std::string>(
-        "debug_graphviz_path", &strategy.debug_graphviz_path_);
-    multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
-        "graph_printer", new details::GraphvizSSAGraphPrinter);
-    graph = multi_devices_print_pass->Apply(std::move(graph));
-  }
-
-  // Verify that the graph is correct for multi-device executor.
-  auto multi_devices_check_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_check_pass");
-  graph = multi_devices_check_pass->Apply(std::move(graph));
-  return graph;
-}
-
 class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
 // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
 // ncclOp
 #ifdef PADDLE_WITH_CUDA
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy,
-      member_->nccl_ctxs_.get());
+      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());

  auto max_memory_size = GetEagerDeletionThreshold();
  if (max_memory_size >= 0) {
@@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor(
    }
  }
 #else
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
-      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy);
+  std::unique_ptr<ir::Graph> graph =
+      build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                           params, member_->local_scopes_, member_->use_cuda_);
 #endif

+  // If the loss_var_name is given, the number of graph should be only one.
+  if (loss_var_name.size()) {
+    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
+                      "The number of graph should be only one");
+  }
+
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
        exec_strategy, member_->local_scopes_, places, std::move(graph)));
@@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() {

 }  // namespace framework
 }  // namespace paddle
-
-USE_PASS(fuse_elewise_add_act_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(multi_devices_pass);
-USE_PASS(multi_devices_check_pass);
-USE_PASS(multi_devices_print_pass);
 #ifdef PADDLE_WITH_CUDA
 USE_PASS(reference_count_pass);
 #endif
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,14 +14,14 @@ limitations under the License. */

 #pragma once

-#include <paddle/fluid/framework/details/build_strategy.h>
 #include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
+#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
    selected_rows_.reset(new SelectedRows(rows, height));

    Tensor* value = selected_rows_->mutable_value();
-    value->mutable_data<float>(
+    auto* data = value->mutable_data<float>(
        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+    for (int64_t i = 0; i < value->numel(); ++i) {
+      data[i] = static_cast<float>(i);
+    }
  }

 protected:
@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+  auto* dst_data = dst_tensor.value().data<float>();
+  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
+    ASSERT_EQ(dst_data[i], static_cast<float>(i));
+  }
 }

 TEST(SelectedRows, SparseTable) {

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
 TEST(Analyzer, analysis_with_tensorrt) {
  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
  analyser.Run(&argument);
 }

-void TestWord2vecPrediction(const std::string &model_path) {
+void TestWord2vecPrediction(const std::string& model_path) {
  NativeConfig config;
  config.model_dir = model_path;
  config.use_gpu = false;
@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
  // The outputs' buffers are in CPU memory.
  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
    LOG(INFO) << "data: "
-              << static_cast<float *>(outputs.front().data.data())[i];
-    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
+              << static_cast<float*>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
                   result[i]);
  }
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
  }
 }

-void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+void CreateTrtEngineOp(Node *node, Argument *argument,
                       framework::proto::BlockDesc *block) {
+  PADDLE_ENFORCE(argument->main_dfg.get());
+  const DataFlowGraph &graph = *(argument->main_dfg);
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,

  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
  // Set attrs
+
  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
+  SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
+  SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  *block_desc.Proto()->mutable_vars() =
      argument_->origin_program_desc->blocks(0).vars();
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
+  CreateTrtEngineOp(node, argument_, block_desc.Proto());
  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
 void SubGraphFuse::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
+    if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
+      continue;
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph

--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -20,6 +20,7 @@ limitations under the License. */

 #include <vector>

+#include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/node.h"

@@ -63,8 +64,11 @@ class SubGraphFuse {
 public:
  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;

-  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
-      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
+               Argument *argument)
+      : graph_(graph),
+        node_inside_subgraph_teller_(teller),
+        argument_(argument) {}

  // The main method which run all the logic.
  void operator()();
@@ -76,6 +80,7 @@ class SubGraphFuse {
 private:
  DataFlowGraph *graph_;
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument *argument_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
 TEST(SubGraphSplitter, Fuse) {
  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
  auto dfg = ProgramDescToDFG(desc);
+  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(3));

  size_t count0 = dfg.nodes.size();

-  SubGraphFuse fuse(&dfg, teller);
+  SubGraphFuse fuse(&dfg, teller, &argument);
  fuse();

  int count1 = 0;

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
    : node_inside_subgraph_teller_(teller) {}

 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
-  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
  VLOG(4) << "debug info "
          << graph->HumanReadableInfo(false /*show_values*/,
                                      true /*show_functions*/);

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {

  explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);

-  bool Initialize(Argument* argument) override { return true; }
+  bool Initialize(Argument* argument) override {
+    argument_ = argument;
+    return true;
+  }

  // This class get a sub-graph as input and determine whether to transform this
  // sub-graph into TensorRT.
@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {

 private:
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument* argument_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
  };

  Argument argument(FLAGS_inference_model_dir);
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));

  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -76,10 +76,10 @@ bool AnalysisPredictor::Init(
  }

  OptimizeInferenceProgram();
-  ctx_ = executor_->Prepare(*inference_program_, 0);
  if (config_._use_mkldnn) {
    executor_->EnableMKLDNN(*inference_program_);
  }
+  ctx_ = executor_->Prepare(*inference_program_, 0);

  VLOG(5) << "to create variables";
  PADDLE_ENFORCE(scope_.get());

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
    VLOG(3) << "Predictor::init()";
-    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
-    FLAGS_tensorrt_workspace_size = config_.workspace_size;
    if (config_.use_gpu) {
      place_ = paddle::platform::CUDAPlace(config_.device);
    } else {
@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  void OptimizeInferenceProgram() {
    // Analyze inference_program
    Argument argument;
+
+    argument.Set<int>("minimum_subgraph_size",
+                      new int(config_.minimum_subgraph_size));
+    argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
+    argument.Set<int>("workspace_size", new int(config_.workspace_size));
+    argument.Set<std::string>("precision_mode",
+                              new std::string(config_.precision_mode));
+
    if (!config_.model_dir.empty()) {
      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
    } else {

--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig {
  // For workspace_size, refer it from here:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int workspace_size{1 << 30};
+  //  We transform the Ops that can be converted into TRT layer in the model,
+  //  and aggregate these Ops into subgraphs for TRT execution.
+  //  We set this variable to control the minimum number of nodes in the
+  //  subgraph, 3 as default value.
+  int minimum_subgraph_size = 3;
+  // Reserved configuration
+  // We just support "FP32" now, "FP16" and "INT8" will be supported.
+  std::string precision_mode = "FP32";
 };

 // NOTE WIP, not stable yet.
@@ -204,10 +212,11 @@ struct AnalysisConfig : public NativeConfig {
    kExclude   // Specify the disabled passes in `ir_passes`.
  };

+  // Determine whether to perform graph optimization.
  bool enable_ir_optim = true;
+  // Manually determine the IR passes to run.
  IrPassMode ir_mode{IrPassMode::kExclude};
-  // attention lstm fuse works only on some specific models, disable as default.
-  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+  std::vector<std::string> ir_passes;

  // NOTE this is just for internal development, please not use it.
  bool _use_mkldnn{false};

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)

+# seq_conv1
+set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
+download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
+
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
@@ -85,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
   endif()
 endif()
+
+if(WITH_GPU AND TENSORRT_FOUND)
+   set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+   endif()
+   cc_test(test_trt_models SRCS trt_models_tester.cc  
+     ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
+     DEPS paddle_inference_tensorrt_subgraph_engine)
+endif()
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+
+// Holds the seq_conv1 samples read from a text file and slices them into
+// batches.
+//
+// The `*_all` members keep every sample of this record (one inner vector per
+// sample). For a record returned by NextBatch(), `title1`/`title2`/`title3`/
+// `l1` hold the samples of that batch and the `*_lod` members hold the
+// cumulative sample lengths, starting with 0.
+struct DataRecord {
+  std::vector<std::vector<int64_t>> title1_all, title2_all, title3_all, l1_all;
+  std::vector<std::vector<int64_t>> title1, title2, title3, l1;
+  std::vector<size_t> title1_lod, title2_lod, title3_lod, l1_lod;
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  // Total number of loaded samples. Zero-initialised so that the records
+  // returned by value from NextBatch() never carry an indeterminate value.
+  size_t num_samples{0};
+  DataRecord() = default;
+  // Loads all samples from `path`; `batch_size` is the number of samples
+  // handed out by each NextBatch() call.
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  // Returns a record holding the next `batch_size` samples and advances the
+  // cursor. When fewer than `batch_size` samples remain, the returned record
+  // is empty (including its LoD vectors) but the cursor still advances.
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE skip the final batch, if no enough data is provided.
+    if (batch_end <= title1_all.size()) {
+      data.title1_all.assign(title1_all.begin() + batch_iter,
+                             title1_all.begin() + batch_end);
+      data.title2_all.assign(title2_all.begin() + batch_iter,
+                             title2_all.begin() + batch_end);
+      data.title3_all.assign(title3_all.begin() + batch_iter,
+                             title3_all.begin() + batch_end);
+      data.l1_all.assign(l1_all.begin() + batch_iter,
+                         l1_all.begin() + batch_end);
+      // Prepare LoDs: every LoD starts at offset 0.
+      data.title1_lod.push_back(0);
+      data.title2_lod.push_back(0);
+      data.title3_lod.push_back(0);
+      data.l1_lod.push_back(0);
+      CHECK(!data.title1_all.empty());
+      CHECK(!data.title2_all.empty());
+      CHECK(!data.title3_all.empty());
+      CHECK(!data.l1_all.empty());
+      CHECK_EQ(data.title1_all.size(), data.title2_all.size());
+      CHECK_EQ(data.title1_all.size(), data.title3_all.size());
+      CHECK_EQ(data.title1_all.size(), data.l1_all.size());
+      for (size_t j = 0; j < data.title1_all.size(); j++) {
+        data.title1.push_back(data.title1_all[j]);
+        data.title2.push_back(data.title2_all[j]);
+        data.title3.push_back(data.title3_all[j]);
+        data.l1.push_back(data.l1_all[j]);
+        // calculate lod: append the running total of sample lengths.
+        data.title1_lod.push_back(data.title1_lod.back() +
+                                  data.title1_all[j].size());
+        data.title2_lod.push_back(data.title2_lod.back() +
+                                  data.title2_all[j].size());
+        data.title3_lod.push_back(data.title3_lod.back() +
+                                  data.title3_all[j].size());
+        data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size());
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  // Reads `path` line by line. Each line must contain at least four
+  // tab-separated fields (title1, title2, title3, l1), each of which is a
+  // space-separated list of int64 values.
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    // Fail fast instead of silently producing an empty data set.
+    CHECK(file.is_open()) << "failed to open data file: " << path;
+    std::string line;
+    while (std::getline(file, line)) {
+      std::vector<std::string> data;
+      split(line, '\t', &data);
+      // Indexing data[0..3] below is only valid with four fields present.
+      CHECK_GE(data.size(), 4UL)
+          << "expect at least 4 tab-separated fields per line in " << path
+          << ", but got " << data.size();
+      // load title1 data
+      std::vector<int64_t> title1_data;
+      split_to_int64(data[0], ' ', &title1_data);
+      // load title2 data
+      std::vector<int64_t> title2_data;
+      split_to_int64(data[1], ' ', &title2_data);
+      // load title3 data
+      std::vector<int64_t> title3_data;
+      split_to_int64(data[2], ' ', &title3_data);
+      // load l1 data
+      std::vector<int64_t> l1_data;
+      split_to_int64(data[3], ' ', &l1_data);
+      title1_all.push_back(std::move(title1_data));
+      title2_all.push_back(std::move(title2_data));
+      title3_all.push_back(std::move(title3_data));
+      l1_all.push_back(std::move(l1_data));
+    }
+    // One sample is stored per line read.
+    num_samples = title1_all.size();
+  }
+};
+
+// Fetches the next batch from `data` and fills `input_slots` with the four
+// INT64 input tensors "title1", "title2", "title3" and "l1" (in that order).
+// Each tensor has shape {total_length, 1} and a single-level LoD.
+//
+// `batch_size` is not read here; the batch size configured in `data` is what
+// NextBatch() uses.
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  auto one_batch = data->NextBatch();
+  // NextBatch() returns an empty record once fewer than a full batch of
+  // samples remains; reading the last LoD entry of such a record is invalid.
+  CHECK(!one_batch.title1_lod.empty() && !one_batch.title2_lod.empty() &&
+        !one_batch.title3_lod.empty() && !one_batch.l1_lod.empty())
+      << "no more complete batch is available in the data record";
+
+  // Builds one input tensor from its name, LoD offsets and sample values.
+  auto make_tensor = [](const std::string &name,
+                        const std::vector<size_t> &lod,
+                        const std::vector<std::vector<int64_t>> &values) {
+    PaddleTensor tensor;
+    tensor.name = name;
+    // The last LoD offset is the total number of elements in the batch.
+    int total_size = lod.back();
+    tensor.shape.assign({total_size, 1});
+    tensor.lod.assign({lod});
+    TensorAssignData<int64_t>(&tensor, values);
+    tensor.dtype = PaddleDType::INT64;
+    return tensor;
+  };
+
+  // Set inputs.
+  input_slots->assign(
+      {make_tensor("title1", one_batch.title1_lod, one_batch.title1),
+       make_tensor("title2", one_batch.title2_lod, one_batch.title2),
+       make_tensor("title3", one_batch.title3_lod, one_batch.title3),
+       make_tensor("l1", one_batch.l1_lod, one_batch.l1)});
+}
+
+// Fills `cfg` for a CPU run of the model at FLAGS_infer_model, with inputs
+// addressed by name and IR optimization enabled.
+void SetConfig(AnalysisConfig *cfg) {
+  // Model location.
+  cfg->model_dir = FLAGS_infer_model;
+  // Analysis options.
+  cfg->enable_ir_optim = true;
+  cfg->specify_input_name = true;
+  // Device selection.
+  cfg->device = 0;
+  cfg->use_gpu = false;
+}
+
+// Appends the input slots of each batch to `inputs`: a single batch by
+// default, or every complete batch of FLAGS_infer_data when
+// FLAGS_test_all_data is set.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  int num_batches = 1;
+  if (FLAGS_test_all_data) {
+    num_batches = data.num_samples / FLAGS_batch_size;
+  }
+  LOG(INFO) << "number of samples: " << num_batches * FLAGS_batch_size;
+  std::vector<PaddleTensor> batch_slots;
+  for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+    PrepareInputs(&batch_slots, &data, FLAGS_batch_size);
+    inputs->push_back(batch_slots);
+  }
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_seq_conv1, profile) {
+  AnalysisConfig config;
+  SetConfig(&config);
+
+  std::vector<std::vector<PaddleTensor>> all_inputs;
+  SetInput(&all_inputs);
+
+  std::vector<PaddleTensor> outputs;
+  TestPrediction(config, all_inputs, &outputs, FLAGS_num_threads);
+
+  // The output is only inspected for a single-threaded run over one batch.
+  if (FLAGS_num_threads != 1 || FLAGS_test_all_data) {
+    return;
+  }
+
+  // the first inference result
+  PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+  size_t num_elements = GetSize(outputs[0]);
+  PADDLE_ENFORCE_GT(num_elements, 0);
+  float *probs = static_cast<float *>(outputs[0].data.data());
+  // output is probability, which is in (0, 1).
+  for (size_t idx = 0; idx < num_elements; ++idx) {
+    EXPECT_GT(probs[idx], 0);
+    EXPECT_LT(probs[idx], 1);
+  }
+}
+
+// Check the fuse status
+TEST(Analyzer_seq_conv1, fuse_statis) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  // Only checks that the statistics can be collected; neither the returned
+  // statistics nor the op count is asserted on.
+  int op_count;
+  auto statis = GetFuseStatis(config, &op_count);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_seq_conv1, compare) {
+  // Same inputs are fed to both predictors by CompareNativeAndAnalysis.
+  std::vector<std::vector<PaddleTensor>> all_inputs;
+  SetInput(&all_inputs);
+
+  AnalysisConfig config;
+  SetConfig(&config);
+  CompareNativeAndAnalysis(config, all_inputs);
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+using paddle::contrib::MixedRTConfig;
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+// Builds the configuration of the native predictor: GPU device 0, with
+// fraction_of_gpu_memory set to 0.45 and the model read from FLAGS_dirname.
+NativeConfig GetConfigNative() {
+  NativeConfig native_cfg;
+  native_cfg.use_gpu = true;
+  native_cfg.device = 0;
+  native_cfg.fraction_of_gpu_memory = 0.45;
+  native_cfg.model_dir = FLAGS_dirname;
+  return native_cfg;
+}
+
+// Builds the configuration of the TensorRT mixed predictor: GPU device 0, with
+// fraction_of_gpu_memory set to 0.2, max_batch_size set to 3 and the model
+// read from FLAGS_dirname.
+MixedRTConfig GetConfigTRT() {
+  MixedRTConfig trt_cfg;
+  trt_cfg.use_gpu = true;
+  trt_cfg.device = 0;
+  trt_cfg.fraction_of_gpu_memory = 0.2;
+  trt_cfg.max_batch_size = 3;
+  trt_cfg.model_dir = FLAGS_dirname;
+  return trt_cfg;
+}
+
+// Runs the model under |model_dirname| through the native predictor and the
+// TensorRT mixed predictor on the same input and expects matching outputs.
+//
+// The input is a single FLOAT32 tensor named "input_0" with shape
+// [batch_size, 3, 224, 224]; it is all zeros except for its first element.
+// Both predictors must return exactly one output tensor. The two outputs are
+// compared element-wise with an absolute tolerance of 1e-3.
+void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
+  NativeConfig config0 = GetConfigNative();
+  config0.model_dir = model_dirname;
+
+  MixedRTConfig config1 = GetConfigTRT();
+  config1.model_dir = model_dirname;
+  config1.max_batch_size = batch_size;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
+      CreatePaddlePredictor<MixedRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+
+  // Prepare the input buffer. It is held by a std::vector so that it is
+  // released on every exit path (including the early returns of ASSERT_*).
+  // The PaddleBuf below is constructed from an external pointer and is not
+  // expected to take ownership of it, so the vector has to outlive both Run()
+  // calls, which it does as a function-scope local.
+  const int height = 224;
+  const int width = 224;
+  std::vector<float> input(static_cast<size_t>(batch_size) * 3 * height * width,
+                           0.0f);
+  input[0] = 1.0f;
+
+  // Wrap the buffer in the feed tensor.
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data = PaddleBuf(static_cast<void *>(input.data()),
+                          sizeof(float) * input.size());
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  // Run both predictors on the same feed.
+  std::vector<PaddleTensor> outputs0;
+  std::vector<PaddleTensor> outputs1;
+  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
+
+  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+
+  // Each predictor must produce exactly one output.
+  ASSERT_EQ(outputs0.size(), 1UL);
+  ASSERT_EQ(outputs1.size(), 1UL);
+
+  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+  EXPECT_EQ(num_elements, num_elements1);
+
+  auto *data0 = static_cast<float *>(outputs0.front().data.data());
+  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+  ASSERT_GT(num_elements, 0UL);
+  // Only the common prefix is compared, so a size mismatch (already reported
+  // above) cannot cause an out-of-bounds read.
+  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+  }
+}
+
+TEST(trt_models_test, main) {
+  // Sub-directories of FLAGS_dirname, one per model under test.
+  const std::vector<std::string> model_names = {"mobilenet", "resnet50",
+                                                "resnext50"};
+  const int batch_size = 1;
+  for (const std::string &name : model_names) {
+    const std::string model_path = FLAGS_dirname + "/" + name;
+    CompareTensorRTWithFluid(batch_size, model_path);
+  }
+}
+}  // namespace paddle
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include <string>
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace operators {
@@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
  }
 };

-__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC(
+UNUSED constexpr char SigmoidDoc[] = R"DOC(
 Sigmoid Activation Operator

 $$out = \frac{1}{1 + e^{-x}}$$

 )DOC";

-__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
+UNUSED constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator

 $$out = \\log \\frac{1}{1 + e^{-x}}$$

 )DOC";

-__attribute__((unused)) constexpr char ExpDoc[] = R"DOC(
+UNUSED constexpr char ExpDoc[] = R"DOC(
 Exp Activation Operator.

 $out = e^x$

 )DOC";

-__attribute__((unused)) constexpr char ReluDoc[] = R"DOC(
+UNUSED constexpr char ReluDoc[] = R"DOC(
 Relu Activation Operator.

 $out = \max(x, 0)$

 )DOC";

-__attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
+UNUSED constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.

 $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 )DOC";

-__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
+UNUSED constexpr char TanhShrinkDoc[] = R"DOC(
 TanhShrink Activation Operator.

 $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 )DOC";

-__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC(
+UNUSED constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.

 $out = \sqrt{x}$

 )DOC";

-__attribute__((unused)) constexpr char AbsDoc[] = R"DOC(
+UNUSED constexpr char AbsDoc[] = R"DOC(
 Abs Activation Operator.

 $out = |x|$

 )DOC";

-__attribute__((unused)) constexpr char CeilDoc[] = R"DOC(
+UNUSED constexpr char CeilDoc[] = R"DOC(
 Ceil Activation Operator.

 $out = ceil(x)$

 )DOC";

-__attribute__((unused)) constexpr char FloorDoc[] = R"DOC(
+UNUSED constexpr char FloorDoc[] = R"DOC(
 Floor Activation Operator.

 $out = floor(x)$

 )DOC";

-__attribute__((unused)) constexpr char CosDoc[] = R"DOC(
+UNUSED constexpr char CosDoc[] = R"DOC(
 Cosine Activation Operator.

 $out = cos(x)$

 )DOC";

-__attribute__((unused)) constexpr char SinDoc[] = R"DOC(
+UNUSED constexpr char SinDoc[] = R"DOC(
 Sine Activation Operator.

 $out = sin(x)$

 )DOC";

-__attribute__((unused)) constexpr char RoundDoc[] = R"DOC(
+UNUSED constexpr char RoundDoc[] = R"DOC(
 Round Activation Operator.

 $out = [x]$

 )DOC";

-__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
+UNUSED constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.

 $$out = \\frac{1}{x}$$

 )DOC";

-__attribute__((unused)) constexpr char LogDoc[] = R"DOC(
+UNUSED constexpr char LogDoc[] = R"DOC(
 Log Activation Operator.

 $out = \ln(x)$
@@ -212,21 +213,21 @@ Natural logarithm of x.

 )DOC";

-__attribute__((unused)) constexpr char SquareDoc[] = R"DOC(
+UNUSED constexpr char SquareDoc[] = R"DOC(
 Square Activation Operator.

 $out = x^2$

 )DOC";

-__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC(
+UNUSED constexpr char SoftplusDoc[] = R"DOC(
 Softplus Activation Operator.

 $out = \ln(1 + e^{x})$

 )DOC";

-__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC(
+UNUSED constexpr char SoftsignDoc[] = R"DOC(
 Softsign Activation Operator.

 $$out = \frac{x}{1 + |x|}$$

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
                      "Out and Label should have same height.");

    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
+    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
+
+    PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
+    PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");

    ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("BatchAUC", {1});
-    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
-    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
+
+    slide_steps = slide_steps == 0 ? 1 : slide_steps;
+    ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
  }

 protected:
@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data. "
             "shape: [batch_size, 1]");
+
    // TODO(typhoonzero): support weight input
    AddInput("StatPos", "Statistic value when label = 1");
    AddInput("StatNeg", "Statistic value when label = 0");
@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
-    AddOutput("BatchAUC", "The AUC for current batch");
+
    AddOutput("StatPosOut", "Statistic value when label = 1");
    AddOutput("StatNegOut", "Statistic value when label = 0");

    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");

-    AddAttr<int>("num_thresholds",
-                 "The number of thresholds to use when discretizing the"
-                 " roc curve.")
+    AddAttr<int>(
+        "num_thresholds",
+        "The number of thresholds to use when discretizing the roc curve.")
        .SetDefault((2 << 12) - 1);
-
+    AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
+        .SetDefault(1);
    AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.


--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {

    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    // buckets contain numbers from 0 to num_thresholds
    int num_pred_buckets = num_thresholds + 1;
+    int slide_steps = ctx.Attr<int>("slide_steps");

    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

-    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
-            auc);
+    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());

-    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
-    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
-    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
-    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
-            num_thresholds, batch_auc);
+    std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
+
+    auto stat_pos_calc = stat_pos_data.data();
+    auto stat_neg_calc = stat_neg_data.data();
+
+    statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
+            origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
+
+    calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
  }

 private:
@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
  }

-  inline static void calcAuc(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *label,
+  inline static void statAuc(const framework::Tensor *label,
                              const framework::Tensor *predict,
-                             int64_t *stat_pos, int64_t *stat_neg,
-                             int num_thresholds,
-                             framework::Tensor *auc_tensor) {
+                             const int num_pred_buckets,
+                             const int num_thresholds, const int slide_steps,
+                             int64_t *origin_stat_pos, int64_t *origin_stat_neg,
+                             int64_t **stat_pos, int64_t **stat_neg) {
     size_t batch_size = predict->dims()[0];
     size_t inference_width = predict->dims()[1];
     const T *inference_data = predict->data<T>();
     const auto *label_data = label->data<int64_t>();

-    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
     for (size_t i = 0; i < batch_size; i++) {
       uint32_t binIdx = static_cast<uint32_t>(
           inference_data[i * inference_width + 1] * num_thresholds);
       if (label_data[i]) {
-        stat_pos[binIdx] += 1.0;
+        (*stat_pos)[binIdx] += 1.0;
       } else {
-        stat_neg[binIdx] += 1.0;
+        (*stat_neg)[binIdx] += 1.0;
       }
     }

+    int bucket_length = num_pred_buckets * sizeof(int64_t);  // bytes per step
+
+    // slide_steps == 0: accumulate into the persistent buffers, no window.
+    if (slide_steps == 0) {
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        origin_stat_pos[slide] += (*stat_pos)[slide];
+        origin_stat_neg[slide] += (*stat_neg)[slide];
+      }
+
+      *stat_pos = origin_stat_pos;  // caller computes AUC from the totals
+      *stat_neg = origin_stat_neg;
+
+    } else {
+      for (int slide = 1; slide < slide_steps; ++slide) {  // drop oldest step
+        int dst_idx = (slide - 1) * num_pred_buckets;
+        int src_inx = slide * num_pred_buckets;
+        std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
+                    bucket_length);
+        std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
+                    bucket_length);
+      }
+
+      std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
+                  *stat_pos, bucket_length);  // current batch -> newest step
+      std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
+                  *stat_neg, bucket_length);
+
+      std::memset(*stat_pos, 0, bucket_length);  // reuse as window totals
+      std::memset(*stat_neg, 0, bucket_length);
+
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {  // per bucket
+        int64_t stat_pos_steps = 0;  // 64-bit, like the counters it sums
+        int64_t stat_neg_steps = 0;  // (an int would truncate large counts)
+        for (int step = 0; step < slide_steps; ++step) {
+          stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
+          stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
+        }
+        (*stat_pos)[slide] += stat_pos_steps;
+        (*stat_neg)[slide] += stat_neg_steps;
+      }
+    }
+  }
+
+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
+
    *auc = 0.0f;

    double totPos = 0.0;
@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
      totPos += stat_pos[idx];
      totNeg += stat_neg[idx];
      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-
      --idx;
    }


--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
 polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
-detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+
+if(WITH_GPU)
+  detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
+else()
+  detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+endif()
+
 detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
 #Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"

@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
-        platform::CPUPlace());
+        ctx.device_context());
  }
 };

@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  T im_scale = im_info_data[2];
-  keep->Resize({boxes->dims()[0], 1});
+  keep->Resize({boxes->dims()[0]});
  min_size = std::max(min_size, 1.0f);
  int *keep_data = keep->mutable_data<int>(ctx.GetPlace());

@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("post_nms_topN", "post_nms_topN");
    AddAttr<float>("nms_thresh", "nms_thres");
    AddAttr<float>("min_size", "min size");
-    AddAttr<float>("eta", "eta");
+    AddAttr<float>("eta", "The parameter for adaptive NMS.");
    AddComment(R"DOC(
 Generate Proposals OP


--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/gather.cu.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+namespace {
+
+// Integer division of m by n rounded up (for non-negative m and positive n).
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+// Loops i over [0, n), starting at this thread's global index and advancing by
+// the total number of threads in the launch.
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Number of bits in a uint64_t; used below both as the NMS block size and as
+// the number of boxes covered by one mask word.
+int const kThreadsPerBlock = sizeof(uint64_t) * 8;
+
+// Writes the arithmetic sequence start, start + delta, start + 2 * delta, ...
+// into out[0 .. size).
+template <typename T>
+__global__ void RangeInitKernel(const T start, const T delta, const int size,
+                                T *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size);
+       i += blockDim.x * gridDim.x) {
+    out[i] = start + i * delta;
+  }
+}
+
+// Sorts the elements of |value| in descending order on the GPU.
+//
+// value_out receives the sorted values and index_out the position each sorted
+// value had in |value| (positions are generated as 0, 1, ..., num - 1 and
+// permuted together with the keys). Both outputs are allocated here with
+// value.numel() elements on the place of |ctx|.
+//
+// All device work, including the CUB radix sort, is issued on ctx.stream() so
+// that the sort is ordered after the kernel that initializes its index input.
+template <typename T>
+void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
+                    Tensor *value_out, Tensor *index_out) {
+  int num = value.numel();
+  Tensor index_in_t;
+  int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
+  int block = 512;
+  auto stream = ctx.stream();
+  RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
+  int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
+
+  const T *keys_in = value.data<T>();
+  T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
+
+  // Sort on all key bits; these are the CUB defaults, spelled out only because
+  // they precede the stream argument.
+  const int begin_bit = 0;
+  const int end_bit = sizeof(T) * 8;
+
+  // First call with a null workspace only reports the required size.
+  void *d_temp_storage = NULL;
+  size_t temp_storage_bytes = 0;
+  cub::DeviceRadixSort::SortPairsDescending<T, int>(
+      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
+      num, begin_bit, end_bit, stream);
+
+  // Allocate temporary storage
+  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  d_temp_storage = memory::Alloc(place, temp_storage_bytes);
+
+  // Run sorting operation
+  cub::DeviceRadixSort::SortPairsDescending<T, int>(
+      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
+      num, begin_bit, end_bit, stream);
+
+  memory::Free(place, d_temp_storage);
+}
+
+// Returns x when x < y, otherwise y.
+template <typename T>
+__device__ __forceinline__ T Min(T x, T y) {
+  if (x < y) {
+    return x;
+  }
+  return y;
+}
+
+// Returns x when x > y, otherwise y.
+template <typename T>
+__device__ __forceinline__ T Max(T x, T y) {
+  if (x > y) {
+    return x;
+  }
+  return y;
+}
+
+// Decodes the selected anchors with their regression deltas and clips the
+// resulting boxes to the image.
+//
+// For every i in [0, num) the entry index[i] of |anchor|, |deltas| and |var|
+// is read (four values each, starting at offset index[i] * 4) and the decoded
+// box is written to proposals[i * 4 .. i * 4 + 3] as (xmin, ymin, xmax, ymax).
+// |var| may be null, in which case the deltas are used unscaled. x coordinates
+// are clamped to [0, im_info[1] - 1] and y coordinates to [0, im_info[0] - 1].
+template <typename T>
+__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
+                                       const T *var, const int *index,
+                                       const T *im_info, const int num,
+                                       T *proposals) {
+  // Upper bound applied to the width/height deltas before exponentiation.
+  T kBBoxClipDefault = log(1000.0 / 16.0);
+  CUDA_1D_KERNEL_LOOP(i, num) {
+    // Offset of the i-th selected entry in the unsorted inputs.
+    int k = index[i] * 4;
+    T axmin = anchor[k];
+    T aymin = anchor[k + 1];
+    T axmax = anchor[k + 2];
+    T aymax = anchor[k + 3];
+
+    // Anchor size and center.
+    T w = axmax - axmin + 1.0;
+    T h = aymax - aymin + 1.0;
+    T cx = axmin + 0.5 * w;
+    T cy = aymin + 0.5 * h;
+
+    // The first two deltas shift the center, the last two scale the size.
+    T dxmin = deltas[k];
+    T dymin = deltas[k + 1];
+    T dxmax = deltas[k + 2];
+    T dymax = deltas[k + 3];
+
+    // Decoded center (d_cx, d_cy) and size (d_w, d_h).
+    T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
+    if (var) {
+      d_cx = cx + dxmin * w * var[k];
+      d_cy = cy + dymin * h * var[k + 1];
+      d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
+      d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
+    } else {
+      d_cx = cx + dxmin * w;
+      d_cy = cy + dymin * h;
+      d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
+      d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
+    }
+
+    // Back to corner coordinates.
+    T oxmin = d_cx - d_w * 0.5;
+    T oymin = d_cy - d_h * 0.5;
+    T oxmax = d_cx + d_w * 0.5 - 1.;
+    T oymax = d_cy + d_h * 0.5 - 1.;
+
+    // Clip to the image; im_info[1] bounds x and im_info[0] bounds y.
+    proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
+    proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
+  }
+}
+
+// Keeps the boxes whose scaled width and height are at least |min_size| and
+// whose center is not beyond the image extent, writing their indices to |keep|
+// in ascending order and the number of kept boxes to keep_num[0].
+//
+// |bboxes| holds |num| boxes as four consecutive values (xmin, ymin, xmax,
+// ymax); |im_info| holds (height, width, scale).
+//
+// The output position is tracked by thread 0 alone, so the kernel is meant to
+// be launched with a single block of BlockSize threads, as its caller in this
+// file does.
+//
+// The boxes are processed in tiles of one box per thread. The tile loop has
+// the same trip count for every thread of the block and the per-box work is
+// guarded by an explicit bounds check instead, so that each __syncthreads()
+// is reached by all threads even when |num| is not a multiple of the block
+// size.
+template <typename T, int BlockSize>
+__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
+                             const T min_size, const int num, int *keep_num,
+                             int *keep) {
+  T im_h = im_info[0];
+  T im_w = im_info[1];
+  T im_scale = im_info[2];
+
+  int cnt = 0;
+  __shared__ int keep_index[BlockSize];
+
+  for (int base = blockIdx.x * blockDim.x; base < num;
+       base += blockDim.x * gridDim.x) {
+    // Box handled by this thread in the current tile; may be past the end in
+    // the last tile.
+    const int i = base + threadIdx.x;
+
+    keep_index[threadIdx.x] = -1;
+    __syncthreads();
+
+    if (i < num) {
+      int k = i * 4;
+      T xmin = bboxes[k];
+      T ymin = bboxes[k + 1];
+      T xmax = bboxes[k + 2];
+      T ymax = bboxes[k + 3];
+
+      T w = xmax - xmin + 1.0;
+      T h = ymax - ymin + 1.0;
+      T cx = xmin + w / 2.;
+      T cy = ymin + h / 2.;
+
+      // Box size measured at the original image scale.
+      T w_s = (xmax - xmin) / im_scale + 1.;
+      T h_s = (ymax - ymin) / im_scale + 1.;
+
+      if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
+        keep_index[threadIdx.x] = i;
+      }
+    }
+    __syncthreads();
+
+    // Thread 0 compacts the tile's survivors into |keep|, preserving order.
+    if (threadIdx.x == 0) {
+      int size = (num - i) < BlockSize ? num - i : BlockSize;
+      for (int j = 0; j < size; ++j) {
+        if (keep_index[j] > -1) {
+          keep[cnt++] = keep_index[j];
+        }
+      }
+    }
+    // Do not let the next tile overwrite keep_index while thread 0 reads it.
+    __syncthreads();
+  }
+  if (threadIdx.x == 0) {
+    keep_num[0] = cnt;
+  }
+}
+
+// Intersection-over-union of two boxes given as (xmin, ymin, xmax, ymax).
+// Widths and heights are measured inclusively (max - min + 1).
+__device__ inline float IoU(const float *a, const float *b) {
+  // Corners of the overlap rectangle.
+  float left = max(a[0], b[0]);
+  float right = min(a[2], b[2]);
+  float top = max(a[1], b[1]);
+  float bottom = min(a[3], b[3]);
+
+  // Overlap extent, zero when the boxes are disjoint.
+  float width = max(right - left + 1, 0.f);
+  float height = max(bottom - top + 1, 0.f);
+  float inter_s = width * height;
+
+  float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return inter_s / (s_a + s_b - inter_s);
+}
+
+// Computes the pairwise suppression mask used by NMS.
+//
+// The boxes are split into tiles of kThreadsPerBlock boxes. The block at
+// (blockIdx.y, blockIdx.x) compares the boxes of row tile blockIdx.y with the
+// boxes of column tile blockIdx.x. For each row box it writes one mask word
+// to dev_mask[box_index * col_blocks + column_tile]; bit i of that word is set
+// when the IoU with the i-th box of the column tile exceeds
+// |nms_overlap_thresh|. On the diagonal tile a box is only compared with the
+// boxes that follow it.
+//
+// dev_boxes holds n_boxes boxes as four consecutive floats each.
+__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
+                          const float *dev_boxes, uint64_t *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // Number of valid boxes in this row / column tile (the last tile may be
+  // partially filled).
+  const int row_size =
+      min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
+
+  // Stage the column tile's boxes in shared memory, one box per thread.
+  __shared__ float block_boxes[kThreadsPerBlock * 4];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 4 + 0] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+    block_boxes[threadIdx.x * 4 + 1] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+    block_boxes[threadIdx.x * 4 + 2] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+    block_boxes[threadIdx.x * 4 + 3] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
+  }
+  __syncthreads();
+
+  // Each thread owns one row box and builds its mask word for this tile.
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 4;
+    int i = 0;
+    uint64_t t = 0;
+    int start = 0;
+    // Within the diagonal tile, skip the box itself and the boxes before it.
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+// Non-maximum suppression over |proposals|.
+//
+// |proposals| holds boxes_num boxes (first dimension) in the order in which
+// they are to be considered; an earlier box suppresses every later box whose
+// IoU with it exceeds |nms_threshold|. The positions of the surviving boxes
+// within |proposals| are written, in ascending order, to |keep_out|, which is
+// allocated here on the place of |ctx|.
+//
+// |sorted_indices| is only used to check that it has as many entries as there
+// are boxes.
+//
+// The pairwise mask is computed on the GPU by NMSKernel and then reduced on
+// the host.
+// NOTE(review): NMSKernel takes `const float *`, so this only compiles for
+// T = float. It is also launched on the default stream rather than
+// ctx.stream() -- confirm that this ordering is intended.
+template <typename T>
+void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
+         const Tensor &sorted_indices, const T nms_threshold,
+         Tensor *keep_out) {
+  int boxes_num = proposals.dims()[0];
+  PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
+
+  // Nothing to suppress. Return an empty result without issuing a zero-sized
+  // allocation or a kernel launch with an empty grid.
+  if (boxes_num == 0) {
+    keep_out->mutable_data<int>({0}, ctx.GetPlace());
+    return;
+  }
+
+  const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
+  dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
+              DIVUP(boxes_num, kThreadsPerBlock));
+  dim3 threads(kThreadsPerBlock);
+
+  const T *boxes = proposals.data<T>();
+  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+
+  // One mask word per (box, column tile) pair; sized in size_t so the byte
+  // count cannot overflow an int.
+  const size_t mask_len = static_cast<size_t>(boxes_num) * col_blocks;
+  const size_t size_bytes = mask_len * sizeof(uint64_t);
+  uint64_t *d_mask =
+      reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
+  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
+
+  // The host copy of the mask lives in a std::vector so that it is released
+  // on every exit path. The device mask is no longer needed once copied.
+  std::vector<uint64_t> h_mask(mask_len);
+  memory::Copy(platform::CPUPlace(), h_mask.data(), place, d_mask, size_bytes,
+               0);
+  memory::Free(place, d_mask);
+
+  // remv accumulates, as a bit set over all boxes, the boxes suppressed so
+  // far; std::vector value-initializes it to zero.
+  std::vector<uint64_t> remv(col_blocks);
+
+  std::vector<int> keep_vec;
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / kThreadsPerBlock;
+    int inblock = i % kThreadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      ++num_to_keep;
+      keep_vec.push_back(i);
+      // Box i survives: everything it overlaps becomes suppressed.
+      const uint64_t *p = h_mask.data() + static_cast<size_t>(i) * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+  int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
+  memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
+               sizeof(int) * num_to_keep, 0);
+}
+
+// Generates the proposals of a single image on the GPU and returns them as a
+// (boxes, scores) pair of tensors.
+//
+// Steps: sort the scores in descending order and keep the best pre_nms_top_n
+// (all of them if pre_nms_top_n <= 0 or larger than the number of scores);
+// decode and clip the corresponding boxes; drop the boxes rejected by
+// FilterBBoxes; then, unless nms_thresh <= 0, run NMS and keep at most
+// post_nms_top_n survivors (no limit if post_nms_top_n <= 0).
+//
+// NOTE(review): |eta| is accepted but not used in this function.
+template <typename T>
+std::pair<Tensor, Tensor> ProposalForOneImage(
+    const platform::CUDADeviceContext &ctx, const Tensor &im_info,
+    const Tensor &anchors, const Tensor &variances,
+    const Tensor &bbox_deltas,  // [M, 4]
+    const Tensor &scores,       // [N, 1]
+    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
+    float eta) {
+  // 1. pre nms
+  Tensor scores_sort, index_sort;
+  SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
+  int num = scores.numel();
+  int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
+                                                                : pre_nms_top_n;
+  // Shrinking the sorted tensors keeps only their leading pre_nms_num entries.
+  scores_sort.Resize({pre_nms_num, 1});
+  index_sort.Resize({pre_nms_num, 1});
+
+  // 2. box decode and clipping
+  Tensor proposals;
+  proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
+  int block = 512;
+  auto stream = ctx.stream();
+  BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
+      anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
+      index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
+      proposals.data<T>());
+
+  // 3. filter
+  Tensor keep_index, keep_num_t;
+  keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
+  keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
+  min_size = std::max(min_size, 1.0f);
+  // Launched with one block whose size matches the BlockSize template
+  // argument.
+  FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
+      proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
+      keep_num_t.data<int>(), keep_index.data<int>());
+  // The number of kept boxes is needed on the host to size the tensors below.
+  int keep_num;
+  const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
+               keep_num_t.data<int>(), sizeof(int), 0);
+  keep_index.Resize({keep_num});
+
+  // Gather the surviving boxes and their scores.
+  Tensor scores_filter, proposals_filter;
+  proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
+  scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
+  GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+  GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+
+  // A non-positive threshold disables NMS (and the post_nms_top_n limit).
+  if (nms_thresh <= 0) {
+    return std::make_pair(proposals_filter, scores_filter);
+  }
+
+  // 4. nms
+  Tensor keep_nms;
+  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
+  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+    keep_nms.Resize({post_nms_top_n});
+  }
+
+  Tensor scores_nms, proposals_nms;
+  proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
+  scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
+  GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+  GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+
+  return std::make_pair(proposals_nms, scores_nms);
+}
+}  // namespace
+
+// CUDA kernel of the generate_proposals operator.
+//
+// Inputs: Scores, BboxDeltas, ImInfo, Anchors, Variances.
+// Outputs: RpnRois ([num_proposals, 4]) and RpnRoiProbs ([num_proposals, 1]),
+// both with a one-level LoD holding the per-image proposal offsets.
+// Attributes: pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta
+// (eta must be >= 1).
+template <typename DeviceContext, typename T>
+class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
+ public:
+  // Generates the proposals of every image in the batch and concatenates them
+  // into the two outputs.
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *scores = context.Input<Tensor>("Scores");
+    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
+    auto *im_info = context.Input<Tensor>("ImInfo");
+    auto *anchors = context.Input<Tensor>("Anchors");
+    auto *variances = context.Input<Tensor>("Variances");
+
+    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
+    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
+
+    int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
+    int post_nms_top_n = context.Attr<int>("post_nms_topN");
+    float nms_thresh = context.Attr<float>("nms_thresh");
+    float min_size = context.Attr<float>("min_size");
+    float eta = context.Attr<float>("eta");
+    PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS.");
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+
+    // Scores and deltas are 4-D; the first dimension is the batch size.
+    auto scores_dim = scores->dims();
+    int64_t num = scores_dim[0];
+    int64_t c_score = scores_dim[1];
+    int64_t h_score = scores_dim[2];
+    int64_t w_score = scores_dim[3];
+
+    auto bbox_dim = bbox_deltas->dims();
+    int64_t c_bbox = bbox_dim[1];
+    int64_t h_bbox = bbox_dim[2];
+    int64_t w_bbox = bbox_dim[3];
+
+    // Move dimension 1 to the end (axis order 0, 2, 3, 1) so that the values
+    // of one spatial position are contiguous.
+    Tensor bbox_deltas_swap, scores_swap;
+    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
+                                     dev_ctx.GetPlace());
+    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
+                                dev_ctx.GetPlace());
+
+    math::Transpose<DeviceContext, T, 4> trans;
+    std::vector<int> axis = {0, 2, 3, 1};
+    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
+    trans(dev_ctx, *scores, &scores_swap, axis);
+
+    // NOTE(review): the const input tensors are resized in place to [N, 4]
+    // through const_cast; this changes the shape seen by any other reader of
+    // these inputs.
+    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
+    anchor->Resize({anchors->numel() / 4, 4});
+    Tensor *var = const_cast<framework::Tensor *>(variances);
+    var->Resize({var->numel() / 4, 4});
+
+    // Allocate the outputs for the case in which every candidate is kept;
+    // they are shrunk to the actual number of proposals at the end.
+    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
+                              context.GetPlace());
+    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
+
+    T *rpn_rois_data = rpn_rois->data<T>();
+    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
+
+    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
+
+    // offset[i] is the index of the first proposal of image i in the outputs.
+    int64_t num_proposals = 0;
+    std::vector<size_t> offset(1, 0);
+    for (int64_t i = 0; i < num; ++i) {
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
+      Tensor scores_slice = scores_swap.Slice(i, i + 1);
+
+      // Flatten the per-image slices to [M, 4] deltas and [N, 1] scores.
+      bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
+      scores_slice.Resize({h_score * w_score * c_score, 1});
+
+      std::pair<Tensor, Tensor> box_score_pair =
+          ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
+                                 bbox_deltas_slice, scores_slice, pre_nms_top_n,
+                                 post_nms_top_n, nms_thresh, min_size, eta);
+
+      // Note: this local `scores` shadows the input pointer of the same name.
+      Tensor proposals = box_score_pair.first;
+      Tensor scores = box_score_pair.second;
+
+      // Append this image's results behind those of the previous images.
+      memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
+                   proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
+      memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
+                   scores.data<T>(), sizeof(T) * scores.numel(), 0);
+      num_proposals += proposals.dims()[0];
+      offset.emplace_back(num_proposals);
+    }
+    framework::LoD lod;
+    lod.emplace_back(offset);
+    rpn_rois->set_lod(lod);
+    rpn_roi_probs->set_lod(lod);
+    rpn_rois->Resize({num_proposals, 4});
+    rpn_roi_probs->Resize({num_proposals, 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(generate_proposals,
+                        ops::CUDAGenerateProposalsKernel<
+                            paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
    int class_num = ctx.Attr<int>("class_num");

-    auto label_lod = in_label->lod();
-    auto detect_lod = in_detect->lod();
+    auto& label_lod = in_label->lod();
+    auto& detect_lod = in_detect->lod();
    PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
                      "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto labels = framework::EigenTensor<T, 2>::From(input_label);
    auto detect = framework::EigenTensor<T, 2>::From(input_detect);

-    auto label_lod = input_label.lod();
-    auto detect_lod = input_detect.lod();
+    auto& label_lod = input_label.lod();
+    auto& detect_lod = input_detect.lod();

    int batch_size = label_lod[0].size() - 1;
-    auto label_index = label_lod[0];
+    auto& label_index = label_lod[0];

    for (int n = 0; n < batch_size; ++n) {
      std::map<int, std::vector<Box>> boxes;
@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {

    output_true_pos->set_lod(true_pos_lod);
    output_false_pos->set_lod(false_pos_lod);
-    return;
  }

  void GetInputPos(const framework::Tensor& input_pos_count,
@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto SetData = [](const framework::LoDTensor& pos_tensor,
                      std::map<int, std::vector<std::pair<T, int>>>& pos) {
      const T* pos_data = pos_tensor.data<T>();
-      auto pos_data_lod = pos_tensor.lod()[0];
+      auto& pos_data_lod = pos_tensor.lod()[0];
      for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
        for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
          T score = pos_data[j * 2];
@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
    int batch_size = gt_boxes.size();
    for (int n = 0; n < batch_size; ++n) {
-      auto image_gt_boxes = gt_boxes[n];
-      for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
+      auto& image_gt_boxes = gt_boxes[n];
+      for (auto& image_gt_box : image_gt_boxes) {
        size_t count = 0;
-        auto labeled_bboxes = it->second;
+        auto& labeled_bboxes = image_gt_box.second;
        if (evaluate_difficult) {
          count = labeled_bboxes.size();
        } else {
-          for (size_t i = 0; i < labeled_bboxes.size(); ++i)
-            if (!(labeled_bboxes[i].is_difficult)) ++count;
+          for (auto& box : labeled_bboxes) {
+            if (!box.is_difficult) {
+              ++count;
+            }
+          }
        }
        if (count == 0) {
          continue;
        }
-        int label = it->first;
+        int label = image_gt_box.first;
        if (label_pos_count->find(label) == label_pos_count->end()) {
          (*label_pos_count)[label] = count;
        } else {

--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

-    auto in_rows = in.rows();
+    auto &in_rows = in.rows();
    auto out_dim = framework::make_ddim(
        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
  PADDLE_ENFORCE_EQ(b_dims[0], 1,
                    "The first dimension of Input(Bias) should be 1.");
-  PADDLE_ENFORCE_EQ(
-      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
-      "The second dimension of Input(Bias) should be "
-      "7 * %d if enable peepholes connection or"
-      "4 * %d if disable peepholes",
-      frame_size, frame_size);
+  if (ctx->Attrs().Get<bool>("use_peepholes")) {
+    PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                      "The second dimension of Input(Bias) should be "
+                      "7 * %d if enable peepholes connection",
+                      frame_size);
+    ctx->SetOutputDim("CheckedCell", {2, frame_size});
+  } else {
+    PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Bias) should be "
+                      "4 * %d if disable peepholes",
+                      frame_size);
+  }

  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() {
  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
+  AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
+      .AsIntermediate();
  AddAttr<bool>("use_peepholes",
                "(bool, defalut: True) "
                "whether to enable diagonal/peephole connections.")
@@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  const int D3 = D * 3;                  \
  const int D4 = wh_dims[1];

-#define INIT_BASE_INPUT_DATAS                                        \
-  const T* x_data = x->data<T>();                                    \
-  const T* wx_data = wx->data<T>();                                  \
-  const T* wh_data = wh->data<T>();                                  \
-  /* diagonal weight*/                                               \
-  const T* wc_data = bias->data<T>() + D4;                           \
-  /* for peephole only*/                                             \
-  Tensor checked_cell;                                               \
-  T* checked_cell_data = nullptr;                                    \
-  auto place = ctx.GetPlace();                                       \
-  if (use_peepholes) {                                               \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
-    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
+#define INIT_BASE_INPUT_DATAS                                 \
+  const T* x_data = x->data<T>();                             \
+  const T* wx_data = wx->data<T>();                           \
+  const T* wh_data = wh->data<T>();                           \
+  /* diagonal weight*/                                        \
+  const T* wc_data = bias->data<T>() + D4;                    \
+  /* for peephole only*/                                      \
+  T* checked_cell_data = nullptr;                             \
+  auto place = ctx.GetPlace();                                \
+  if (use_peepholes) {                                        \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/          \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");   \
+    checked_cell_data = checked_cell->mutable_data<T>(place); \
  }

 /// Compute LSTM

--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());

      // TODO(yuyang18): Strange code here.
-      memory::Copy(platform::CPUPlace(),
-                   new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
-                   ids_data, ids_num * sizeof(int64_t), stream);
-
+      memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
+                   gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
      d_table->set_rows(new_rows);

      auto *d_table_value = d_table->mutable_value();

--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
    auto out_place = context.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(out_place));

-    memory::Copy(
-        boost::get<platform::CUDAPlace>(out_place), out_data,
-        boost::get<platform::CUDAPlace>(in1_place), in1_data,
-        in1_value.numel() * sizeof(T),
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
+                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T), context.stream());

    auto* in2_data = in2_value.data<T>();
    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
    auto in1_height = input1.height();
    PADDLE_ENFORCE_EQ(in1_height, input2->height());

-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();
    auto& in2_rows = *(input2->mutable_rows());

    auto& in1_value = input1.value();

--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims,
         dims2str(paddings) + pooling_type + suffix;
 }

+static inline int ComputeCeiledOutput(int input_size, int kernel_size,
+                                      int padding, int stride) {
+  return (input_size - kernel_size + 2 * padding) / stride + 1;
+}
+
+static inline void CorrectOutputSize(
+    const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
+    const std::vector<int>& kernel_size, const std::vector<int>& paddings,
+    const std::vector<int>& strides,
+    std::vector<int>& right_bot_padding) {  // NOLINT
+  for (size_t i = 0; i < right_bot_padding.size(); i++) {
+    int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
+                                           paddings[i], strides[i]);
+    if (desired_size != dst_tz[i + 2]) {
+      right_bot_padding[i] += strides[i];
+    }
+  }
+}
+
 template <typename T>
 class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
@@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto pool_p =
        std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
    if (pool_p == nullptr) {
+      const std::vector<int>& padding_left_top(paddings);
+      std::vector<int> padding_right_bottom(paddings);
+      bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+      if (ceil_mode) {
+        CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
+                          padding_right_bottom);
+      }
      auto src_md = platform::MKLDNNMemDesc(
          src_tz, platform::MKLDNNGetDataType<T>(), input_format);

@@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                            mkldnn::memory::format::any);

      std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
-          CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
-                              pooling_type, mkldnn_engine);
+          CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
+                              padding_right_bottom, ksize, pooling_type,
+                              mkldnn_engine, ceil_mode);

      // save pool_pd into global device context to be referred in backward path
      dev_ctx.SetBlob(key_pool_pd, pool_pd);
@@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 private:
  std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
      const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
-      const std::vector<int>& stride, const std::vector<int>& padding,
-      const std::vector<int>& kernel, const std::string& pooling_type,
-      const mkldnn::engine& engine) const {
+      const std::vector<int>& stride, const std::vector<int>& padding_left_top,
+      const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
+      const std::string& pooling_type, const mkldnn::engine& engine,
+      bool ceil_mode) const {
    auto pool_desc = mkldnn::pooling_forward::desc(
        mkldnn::prop_kind::forward,
        pooling_type == "max" ? mkldnn::algorithm::pooling_max
                              : mkldnn::algorithm::pooling_avg,
-        src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero);
+        src, dst, stride, kernel, padding_left_top, padding_right_bot,
+        mkldnn::padding_kind::zero);

    auto p_pool_pd =
        new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);

--- a/paddle/fluid/operators/sampling_id_op.cc
+++ b/paddle/fluid/operators/sampling_id_op.cc
@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
 SamplingId Operator.
 A layer for sampling id from multinomial distribution from the
 input. Sampling one id for one sample.)DOC");
-    AddAttr<float>("min", "Minimum value of random. [default 0.0].")
+    AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
        .SetDefault(0.0f);
-    AddAttr<float>("max", "Maximun value of random. [default 1.0].")
+    AddAttr<float>("max", "Maximum value of random. (float, default 1.0).")
        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "Random seed used for the random number engine. "
-                 "0 means use a seed generated by the system."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time. [default 0].")
+    AddAttr<int>(
+        "seed",
+        "Random seed used for the random number engine. "
+        "0 means use a seed generated by the system. "
+        "Note that if seed is not 0, this operator will always "
+        "generate the same random numbers every time. (int, default 0).")
        .SetDefault(0);
  }
 };

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
    auto out_var_name = op_desc.Output("Out").front();
    auto *out_var = block->FindVarRecursive(out_var_name);

-    out_var->SetType(in_var.GetType());
-    out_var->SetDataType(in_var.GetDataType());
+    if (in_var_name != out_var_name) {
+      out_var->SetType(in_var.GetType());
+      out_var->SetDataType(in_var.GetDataType());
+    }
  }
 };


--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    }

    for (size_t i = 0; i < n; ++i) {
-      PADDLE_ENFORCE_LT(0, offset_data[i],
-                        "The offset[%d] must greater than zero.", i);
+      PADDLE_ENFORCE_LE(0, offset_data[i],
+                        "The offset[%d] must not be less than zero.", i);
      PADDLE_ENFORCE_LT(0, length_data[i],
                        "The length[%d] must greater than zero.", i);
-      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
+      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
                        lod[0][i + 1], "The target tensor's length overflow.");
    }


--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sgd_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
  }
 }

-template <typename T, int block_size>
+template <typename T>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                       const int64_t* rows,
                                       const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we have to use
-    // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(
-        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
+                                       int64_t row_numel, int64_t limit) {
+  for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
+    const T* selected_rows_ptr = selected_rows + i * row_numel;
+    T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
+    for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
+      // Since index in rows of SelectedRows can be duplicate, we have to use
+      // Atomic Operation to avoid concurrent write error.
+      paddle::platform::CudaAtomicAdd(
+          tensor_out_ptr + index,
+          -1.0 * learning_rate[0] * selected_rows_ptr[index]);
+    }
  }
 }
 }  // namespace
@@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);

      auto& in_value = grad->value();
-      framework::Vector<int64_t> in_rows(grad->rows());
+      auto& in_rows = grad->rows();

      int64_t in_row_numel = in_value.numel() / in_rows.size();
      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      auto* in_data = in_value.data<T>();
      auto* out_data = param_out->data<T>();

-      const int block_size = 256;
-      dim3 threads(block_size, 1);
-      dim3 grid(1, in_rows.size());
-      SparseSGDFunctorKernel<
-          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+      const int kThreadsPerBlock = 256;
+      int thread_x = kThreadsPerBlock;
+      int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
+      int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+      SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
+                               ctx.cuda_device_context().stream()>>>(
          in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
-          out_data, in_row_numel);
+          out_data, in_row_numel, in_rows.size());

    } else {
      PADDLE_THROW("Unsupported Variable Type of Grad");

--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto in_vars = context.MultiInputVar("X");
-    int N = in_vars.size();
+    size_t in_num = in_vars.size();
    auto out_var = context.OutputVar("Out");

    bool in_place = out_var == in_vars[0];
@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
      auto &place =
          *context.template device_context<DeviceContext>().eigen_device();
      // If in_place, just skip the first tensor
-      for (int i = in_place ? 1 : 0; i < N; i++) {
+      for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
          if (in_t.numel() == 0) {
@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {

      // Runtime InferShape
      size_t first_dim = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        first_dim += sel_row.rows().size();
      }

      std::vector<int64_t> in_dim;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() > 0) {
          in_dim = framework::vectorize(sel_row.value().dims());
@@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel<T> {
      }
      if (in_dim.empty()) {
        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+        in_dim =
+            framework::vectorize(get_selected_row(in_num - 1).value().dims());
      } else {
        in_dim[0] = static_cast<int64_t>(first_dim);
      }

      out_value->Resize(framework::make_ddim(in_dim));
      out_value->mutable_data<T>(context.GetPlace());
-
      // if all the input sparse vars are empty, no need to
      // merge these vars.
      if (first_dim == 0UL) {
@@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel<T> {
      math::SelectedRowsAddTo<DeviceContext, T> functor;

      int64_t offset = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() == 0) {
          continue;

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -22,8 +22,6 @@
 namespace paddle {

 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");

 namespace operators {

@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
+    AddAttr<int>("max_batch_size", "the maximum batch size.");
+    AddAttr<int>("workspace_size", "the workspace size.");
    AddComment("TensorRT engine operator.");
  }
 };

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -28,8 +28,6 @@
 namespace paddle {

 DECLARE_int32(tensorrt_engine_batch_size);
-DECLARE_int32(tensorrt_max_batch_size);
-DECLARE_int32(tensorrt_workspace_size);

 namespace operators {

@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    int max_batch_size = context.Attr<int>("max_batch_size");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
    }
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-                      FLAGS_tensorrt_max_batch_size);
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);

    std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch = FLAGS_tensorrt_max_batch_size;
-    auto max_workspace = FLAGS_tensorrt_workspace_size;
+    int max_batch_size = context.Attr<int>("max_batch_size");
+    int workspace_size = context.Attr<int>("workspace_size");
+
    auto params = context.Attr<std::vector<std::string>>("parameters");
    std::unordered_set<std::string> parameters;
    for (const auto& param : params) {
@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {

    // TODO(Superjomn) replace this with a different stream
    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
        context.Attr<std::string>("engine_uniq_key"),
        boost::get<platform::CUDAPlace>(context.GetPlace()).device);


--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;

 TEST(TensorRTEngineOp, manual) {
-  FLAGS_tensorrt_engine_batch_size = 2;
-  FLAGS_tensorrt_max_batch_size = 2;
  framework::ProgramDesc program;
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                    std::vector<std::string>({}));
@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
 }

 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
-  FLAGS_tensorrt_engine_batch_size = batch_size;
-  FLAGS_tensorrt_max_batch_size = batch_size;
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CUDAPlace place;
@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {

  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::vector<std::string>>(
      engine_op_desc.Proto(), "parameters",
      std::vector<std::string>({"y0", "y1", "y2", "y3"}));

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #if defined(_WIN32)
 #define NOMINMAX  // msvc max/min macro conflict with std::min/max
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#define GOOGLE_GLOG_DLL_DECL
 #endif

 #ifdef PADDLE_WITH_CUDA
@@ -47,7 +48,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
-#if !defined(__APPLE__) and !defined(_WIN32)
+#if !defined(__APPLE__) && !defined(_WIN32)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
@@ -216,7 +217,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
 }

-#if !defined(__APPLE__) and !defined(_WIN32)
+#if !defined(__APPLE__) && !defined(_WIN32)
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
    ncclResult_t stat, const Args&... args) {
@@ -260,14 +261,8 @@ inline void throw_on_error(T e) {
    }                                                                   \
  } while (false)

-#define PADDLE_THROW_EOF()                                                     \
-  do {                                                                         \
-    throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
-                                           __LINE__);                          \
-  } while (false)
-
 #else
-#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
+#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
 #endif  // REPLACE_ENFORCE_GLOG

 #else  // !_WIN32
@@ -281,6 +276,12 @@ inline void throw_on_error(T e) {
 #define PADDLE_ENFORCE(x, ...) x
 #endif  // !_WIN32

+#define PADDLE_THROW_EOF()                                                     \
+  do {                                                                         \
+    throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
+                                           __LINE__);                          \
+  } while (false)
+
 /*
 * Some enforce helpers here, usage:
 *    int a = 1;
@@ -294,7 +295,7 @@ inline void throw_on_error(T e) {
 *    extra messages is also supported, for example:
 *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
 */
-
+#if !defined(_WIN32)
 #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
 #define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
@@ -307,6 +308,7 @@ inline void throw_on_error(T e) {
  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
+
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
  do {                                                       \
    if (UNLIKELY(nullptr == (__VAL))) {                      \
@@ -326,6 +328,27 @@ inline void throw_on_error(T e) {
                   paddle::string::Sprintf("" __VA_ARGS__));            \
    }                                                                   \
  } while (0)
+#else
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
+
+#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
+  do {                                                                 \
+    if (!((__VAL0)__CMP(__VAL1))) {                                    \
+      PADDLE_THROW("Windows disable the enforce. Enforce failed.");    \
+    }                                                                  \
+  } while (0)
+#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...)                       \
+  do {                                                             \
+    if (nullptr == (__VAL1)) {                                     \
+      PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
+    }                                                              \
+  } while (0)
+#endif  // !_WIN32

 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt

-set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method pass_builder)
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
 if(NOT WIN32)
 list(APPEND PYBIND_DEPS parallel_executor profiler)

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_output", &pd::OpDesc::SetOutput)
      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
-      .def("rename_input", &pd::OpDesc::RenameInput)
-      .def("rename_output", &pd::OpDesc::RenameOutput)
+      .def("_rename_input", &pd::OpDesc::RenameInput)
+      .def("_rename_output", &pd::OpDesc::RenameOutput)
      .def("has_attr", &pd::OpDesc::HasAttr)
      .def("attr_type", &pd::OpDesc::GetAttrType)
      .def("attr_names", &pd::OpDesc::AttrNames)
-      .def("set_attr", &pd::OpDesc::SetAttr)
+      .def("_set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
@@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) {
             std::string ser(seriralized);
             self.SetAttr(name, ser);
           })
-      .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
-      .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
+      .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
+      .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
      .def("check_attrs", &pd::OpDesc::CheckAttrs)
      .def("infer_shape", &pd::OpDesc::InferShape)
      .def("infer_var_type", &pd::OpDesc::InferVarType)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("is_profiler_enabled", platform::IsProfileEnabled);
  m.def("reset_profiler", platform::ResetProfiler);

+  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
+  pass.def(py::init())
+      .def("set_str", [](ir::Pass &self, const std::string &name,
+                         const std::string &attr) {
+        self.Set<std::string>(name, new std::string(attr));
+      });
+
+  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
+      m, "PassBuilder");
+  pb.def(py::init())
+      .def("append_pass",
+           [](ir::PassBuilder &self,
+              const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
+             return self.AppendPass(pass_type);
+           })
+      .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
+      .def("insert_pass",
+           [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
+             return self.InsertPass(idx, pass_type);
+           })
+      .def("remove_pass",
+           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
+
  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
@@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle.
                    },
                    [](BuildStrategy &self, bool b) {
                      self.fuse_elewise_add_act_ops_ = b;
-                    });
+                    })
+      .def("_create_passes_from_strategy",
+           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
+             return self.CreatePassesFromStrategy();
+           });

  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &,

--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
+# train_test(<target> [ARGS <variant>...])
+#
+# Registers one C++ test per variant. Each test is built from
+# test_train_<target>.cc, links paddle_fluid_origin and is given a --dirname
+# pointing at book/<target>_<variant>.train.model/ under the Python tests
+# directory of the build tree. Without ARGS a single un-suffixed test is
+# registered. Every test is marked as DEPENDS on the test_<target> test.
+function(train_test TARGET_NAME)
+    cmake_parse_arguments(train_test "" "" "ARGS" ${ARGN})
+
+    set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+
+    # Every entry has the form "_<variant>". A lone "_" is a placeholder for
+    # "no variant" and is stripped again inside the loop below.
+    set(suffix_list "")
+    if(train_test_ARGS)
+        foreach(variant ${train_test_ARGS})
+            list(APPEND suffix_list "_${variant}")
+        endforeach()
+    else()
+        list(APPEND suffix_list "_")
+    endif()
+
+    foreach(suffix ${suffix_list})
+        string(REGEX REPLACE "^_$" "" suffix "${suffix}")
+        set(train_test_name test_train_${TARGET_NAME}${suffix})
+        cc_test(${train_test_name}
+                SRCS test_train_${TARGET_NAME}.cc
+                DEPS paddle_fluid_origin
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${suffix}.train.model/)
+        set_tests_properties(${train_test_name}
+                PROPERTIES DEPENDS test_${TARGET_NAME})
+    endforeach()
+endfunction()
+
+
+# The training tests are registered only when WITH_TESTING is enabled.
+if(WITH_TESTING)
+  train_test(recognize_digits ARGS mlp conv)
+endif()
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <time.h>
+#include <fstream>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/place.h"
+
+DEFINE_string(dirname, "", "Directory of the train model.");
+
+namespace paddle {
+
+// Loads the combined training program and parameters found under
+// FLAGS_dirname, feeds a constant batch ("img" filled with 1.0, "label"
+// filled with 1) and runs the program repeatedly on CPU. The test expects the
+// loss read after the last run to be lower than the loss after the first run.
+void Train() {
+  constexpr int kBatchSize = 64;
+  constexpr int kImageSide = 28;
+  constexpr int kIterations = 100;
+
+  CHECK(!FLAGS_dirname.empty());
+  framework::InitDevices(false);
+  const auto place = platform::CPUPlace();
+  framework::Executor exe(place);
+  framework::Scope scope;
+
+  auto program = inference::Load(
+      &exe, &scope, FLAGS_dirname + "__model_combined__.main_program",
+      FLAGS_dirname + "__params_combined__");
+
+  // The loss variable is the "Out" of the first "mean" op in block 0.
+  std::string loss_name;
+  for (auto op : program->Block(0).AllOps()) {
+    if (op->Type() != "mean") {
+      continue;
+    }
+    loss_name = op->Output("Out")[0];
+    break;
+  }
+
+  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+
+  // prepare data: one batch of all-ones images ...
+  auto* image = scope.Var("img")->GetMutable<framework::LoDTensor>();
+  image->Resize({kBatchSize, 1, kImageSide, kImageSide});
+
+  float* pixels = image->mutable_data<float>(place);
+  const int pixel_count = kBatchSize * kImageSide * kImageSide;
+  for (int idx = 0; idx < pixel_count; ++idx) {
+    pixels[idx] = 1.0;
+  }
+
+  // ... and one label per image, all set to 1.
+  auto* label = scope.Var("label")->GetMutable<framework::LoDTensor>();
+  label->Resize({kBatchSize, 1});
+  int64_t* label_values = label->mutable_data<int64_t>(place);
+  for (int idx = 0; idx < kBatchSize * 1; ++idx) {
+    label_values[idx] = static_cast<int64_t>(1);
+  }
+
+  auto* loss = scope.Var(loss_name);
+  // Reads the first element of the loss tensor as it currently stands.
+  auto current_loss = [loss]() -> float {
+    return loss->Get<framework::LoDTensor>().data<float>()[0];
+  };
+
+  float first_loss = 0.0;
+  float last_loss = 0.0;
+  for (int step = 0; step < kIterations; ++step) {
+    exe.Run(*program, &scope, 0, false, true);
+    if (step == 0) {
+      first_loss = current_loss();
+    } else if (step == kIterations - 1) {
+      last_loss = current_loss();
+    }
+  }
+  EXPECT_LT(last_loss, first_loss);
+}
+
+TEST(train, recognize_digits) { Train(); }
+
+}  // namespace paddle
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -70,8 +70,8 @@ function cmake_gen() {
    PYTHON_FLAGS=""
    SYSTEM=`uname -s`
    if [ "$SYSTEM" == "Darwin" ]; then
+        echo "Using python abi: $1"
        if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
-            echo "using python abi: $1"
            if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
@@ -82,7 +82,18 @@ function cmake_gen() {
            else
                exit 1
            fi
-        # TODO: qiyang add python3 part here 
+        elif [ "$1" == "cp35-cp35m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
        fi
    else 
        if [ "$1" != "" ]; then
@@ -381,7 +392,7 @@ function run_mac_test() {
 EOF

        # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j8     
+        ctest --output-on-failure -j $1     
        # make install should also be test when unittest 
        make install -j 8
        pip install /usr/local/opt/paddle/share/wheels/*.whl
@@ -629,10 +640,10 @@ EOF

 function gen_capi_package() {
    if [[ ${WITH_C_API} == "ON" ]]; then
-        install_prefix="${PADDLE_ROOT}/build/capi_output"
-        rm -rf $install_prefix
-        make DESTDIR="$install_prefix" install
-        cd $install_prefix/usr/local
+        capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
+        rm -rf $capi_install_prefix
+        make DESTDIR="$capi_install_prefix" install
+        cd $capi_install_prefix/
        ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
    fi
 }
@@ -729,7 +740,11 @@ function main() {
      maccheck)
        cmake_gen ${PYTHON_ABI:-""}
        build_mac
-        run_mac_test
+        run_mac_test ${PROC_RUN:-1}
+        ;;
+      macbuild)
+        cmake_gen ${PYTHON_ABI:-""}
+        build_mac
        ;;
      cicheck_py35)
        cmake_gen ${PYTHON_ABI:-""}

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
-            print("file md5", md5file(filename), md5sum)
+            sys.stderr.write("file %s  md5 %s" % (md5file(filename), md5sum))
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
-        print("Cache file %s not found, downloading %s" % (filename, url))
+        sys.stderr.write("Cache file %s not found, downloading %s" %
+                         (filename, url))
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                    sys.stderr.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()
-
+    sys.stderr.write("\n")
+    sys.stdout.flush()
    return filename



--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size):
            ]
            for name in names:
                for line in f.extractfile(name):
-                    line_split = line.strip().split(six.b('\t'))
+                    line = cpt.to_text(line)
+                    line_split = line.strip().split('\t')
                    if len(line_split) != 2:
                        continue
                    src_seq = line_split[0]  # one source sequence

--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
    word_dict = defaultdict(int)
    with tarfile.open(tar_file, mode="r") as f:
        for line in f.extractfile("wmt16/train"):
-            line_split = line.strip().split(six.b("\t"))
+            line = cpt.to_text(line)
+            line_split = line.strip().split("\t")
            if len(line_split) != 2: continue
            sen = line_split[0] if lang == "en" else line_split[1]
            for w in sen.split():
@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):

        with tarfile.open(tar_file, mode="r") as f:
            for line in f.extractfile(file_name):
-                line_split = line.strip().split(six.b("\t"))
+                line = cpt.to_text(line)
+                line_split = line.strip().split("\t")
                if len(line_split) != 2:
                    continue
                src_words = line_split[src_col].split()

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
        op_desc = op_descs[i]
        if isinstance(op_desc, tuple):
            op_desc = op_desc[0]
-        op_desc.rename_input(old_name, new_name)
-        op_desc.rename_output(old_name, new_name)
+        op_desc._rename_input(old_name, new_name)
+        op_desc._rename_output(old_name, new_name)


 def _create_op_desc_(op_type, inputs, outputs, attrs):
@@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
        if isinstance(val, framework.Block):
            op_desc.set_block_attr(name, val.desc)
        else:
-            op_desc.set_attr(name, val)
+            op_desc._set_attr(name, val)
    return op_desc


@@ -346,7 +346,7 @@ def _append_backward_ops_(block,
        grad_sub_block_list = []
        # If the op has its own sub-block, deal with the sub-block first
        if op.has_attr("sub_block"):
-            sub_block = program.block(op.block_attr_id("sub_block"))
+            sub_block = program.block(op._block_attr_id("sub_block"))
            grad_sub_block = program._create_block()
            grad_sub_block._set_forward_block_idx(sub_block.idx)
            cb = _callback_lookup_(op)
@@ -382,7 +382,7 @@ def _append_backward_ops_(block,
    for op_desc in grad_op_descs:
        new_op_desc = target_block.desc.append_op()
        new_op_desc.copy_from(op_desc)
-        new_op_desc.set_attr(op_role_attr_name, backward)
+        new_op_desc._set_attr(op_role_attr_name, backward)
        grad_to_var["__current_op_desc__"] = new_op_desc
        if callbacks is not None:
            assert (isinstance(callbacks, list))
@@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
    for op_idx in range(start_op_idx, block.desc.op_size()):
        op_desc = block.desc.op(op_idx)
        if op_desc.has_attr("sub_block"):
-            sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
+            sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
        new_vars = set()
        # create new gradient variables
@@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
        op_desc = block.desc.op(op_idx)
        for name in op_desc.input_arg_names():
            if name in var_map:
-                op_desc.rename_input(name, var_map[name])
+                op_desc._rename_input(name, var_map[name])

        for name in op_desc.output_arg_names():
            if block.desc.find_var(name.encode("ascii")):
                new_name = unique_name.generate(name)
-                op_desc.rename_output(name, new_name)
+                op_desc._rename_output(name, new_name)
                var_map[name] = new_name

    for g, ng in six.iteritems(var_map):
@@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        if loss.op is None:
            raise ValueError("loss.op is None. Should not happend")

-    loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
-                     int(core.op_proto_and_checker_maker.OpRole.Forward) |
-                     int(core.op_proto_and_checker_maker.OpRole.Loss))
+    loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
+                      int(core.op_proto_and_checker_maker.OpRole.Forward) |
+                      int(core.op_proto_and_checker_maker.OpRole.Loss))

    if callbacks is not None:
        isinstance(callbacks, list)
@@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        attr_val = [p.name, g.name]
        if g.op.has_attr(op_role_var_attr_name):
            attr_val.extend(g.op.attr(op_role_var_attr_name))
-        g.op.set_attr(op_role_var_attr_name, attr_val)
+        g.op._set_attr(op_role_var_attr_name, attr_val)

    return params_and_grads


--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr):
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
-        clip_op_desc.set_attr("min", self.min)
-        clip_op_desc.set_attr("max", self.max)
+        clip_op_desc._set_attr("min", self.min)
+        clip_op_desc._set_attr("max", self.max)


 def error_clip_callback(block, context):

--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -18,5 +18,10 @@ from . import decoder
 from .decoder import *
 from . import memory_usage_calc
 from .memory_usage_calc import *
+from . import op_frequence
+from .op_frequence import *

-__all__ = decoder.__all__ + memory_usage_calc.__all__
+__all__ = []
+__all__ += decoder.__all__
+__all__ += memory_usage_calc.__all__
+__all__ += op_frequence.__all__
--- a/python/paddle/fluid/contrib/op_frequence.py
+++ b/python/paddle/fluid/contrib/op_frequence.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from collections import OrderedDict
+
+from ..framework import Program
+
+__all__ = ['op_freq_statistic']
+
+
+def op_freq_statistic(program):
+    """
+    Statistics of Op frequency.
+
+    Args:
+        program(Program): The current Program.
+
+    Returns:
+        uni_op_freq(dict): the single op frequency.
+        adj_2_op_freq(dict): the two adjacent ops frequency.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
+        >>>        fluid.default_main_program())
+        >>> for op_type, op_num in uni_op_freq:
+        >>>     print("%s  \t  %d" % (op_type, op_num))
+        >>> for op_type, op_num in adj_2_op_freq:
+        >>>     print("%s  \t  %d" % (op_type, op_num))
+
+    """
+
+    if not isinstance(program, Program):
+        raise TypeError("The input type should be Porgram."
+                        "But you passed in %s" % (type(program)))
+
+    uni_op_freq = OrderedDict()
+    adj_2_op_freq = OrderedDict()
+    op_in_ops = OrderedDict()
+
+    parameters = [p.name for p in program.blocks[0].all_parameters()]
+
+    # get uni_op_freq
+    for op in program.global_block().ops:
+        had_recorded = False
+        for var_name in op.output_arg_names:
+            if var_name in parameters:
+                continue
+            if not had_recorded and uni_op_freq.has_key(op.type):
+                uni_op_freq[op.type] += 1
+                had_recorded = True
+            elif not had_recorded:
+                uni_op_freq[op.type] = 1
+                had_recorded = True
+
+    # get adj_2_op_freq
+    var_gen_op = {}
+    for op in program.global_block().ops:
+        for var_name in op.input_arg_names:
+            if var_name in parameters:
+                continue
+            if var_gen_op.has_key(var_name):
+                assert len(var_gen_op[var_name]) > 0
+                if op_in_ops.has_key(op.type):
+                    op_in_ops[op.type].append(var_gen_op[var_name][-1])
+                else:
+                    op_in_ops[op.type] = [var_gen_op[var_name][-1]]
+            else:
+                print("Var's generate op is not found,%s, %s" %
+                      (var_name, op.type))
+
+        for var_name in op.output_arg_names:
+            if var_gen_op.has_key(var_name):
+                var_gen_op[var_name].append(op.type)
+            else:
+                var_gen_op[var_name] = [op.type]
+
+    for op, in_ops in op_in_ops.iteritems():
+        for in_op in in_ops:
+            op_op = in_op + "->" + op
+            if adj_2_op_freq.has_key(op_op):
+                adj_2_op_freq[op_op] += 1
+            else:
+                adj_2_op_freq[op_op] = 1
+
+    uni_op_freq = sorted(
+        uni_op_freq.items(), key=lambda item: item[1], reverse=True)
+    adj_2_op_freq = sorted(
+        adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
+
+    return uni_op_freq, adj_2_op_freq
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None

 __all__ = [
    'Program',
-    'Operator',
    'default_startup_program',
    'default_main_program',
    'program_guard',
-    'get_var',
    'name_scope',
 ]

@@ -663,11 +661,11 @@ class Operator(object):
                self._update_desc_attr(attr_name, attr_val)

        self.desc.check_attrs()
-        if self.has_kernel(type):
+        if self._has_kernel(type):
            self.desc.infer_var_type(self.block.desc)
            self.desc.infer_shape(self.block.desc)

-    def has_kernel(self, op_type):
+    def _has_kernel(self, op_type):
        return op_type not in self.OP_WITHOUT_KERNEL_SET

    def to_string(self, throw_on_error):
@@ -708,7 +706,7 @@ class Operator(object):
        """
        return self.desc.input(name)

-    def rename_input(self, old_name, new_name):
+    def _rename_input(self, old_name, new_name):
        """
        Rename the `old_name` to `new_name`.

@@ -719,9 +717,9 @@ class Operator(object):
        Returns:
            None
        """
-        self.desc.rename_input(old_name, new_name)
+        self.desc._rename_input(old_name, new_name)

-    def rename_output(self, old_name, new_name):
+    def _rename_output(self, old_name, new_name):
        """
        Rename the `old_name` to `new_name`.

@@ -732,7 +730,7 @@ class Operator(object):
        Returns:
            None
        """
-        self.desc.rename_output(old_name, new_name)
+        self.desc._rename_output(old_name, new_name)

    @property
    def input_names(self):
@@ -796,7 +794,7 @@ class Operator(object):
        """
        return self.desc.attr_type(name)

-    def set_attr(self, name, val):
+    def _set_attr(self, name, val):
        """
        Set the value of attribute by attribute's name.

@@ -829,7 +827,7 @@ class Operator(object):
                isinstance(val, core.ProgramDesc):
            self.desc.set_serialized_attr(name, val.serialize_to_string())
        else:
-            self.desc.set_attr(name, val)
+            self.desc._set_attr(name, val)

    @property
    def attr_names(self):
@@ -848,7 +846,7 @@ class Operator(object):
        """
        return self.desc.attr(name)

-    def block_attr_id(self, name):
+    def _block_attr_id(self, name):
        """
        Get the block attribute's id by name.

@@ -858,9 +856,9 @@ class Operator(object):
        Returns:
            int: the block index.
        """
-        return self.desc.block_attr_id(name)
+        return self.desc._block_attr_id(name)

-    def block_attr(self, name):
+    def _block_attr(self, name):
        """
        Get the block attribute  by name.

@@ -871,11 +869,11 @@ class Operator(object):
            block: the block attribute.
        """

-        id = self.block_attr_id(name)
+        id = self._block_attr_id(name)
        assert (id >= 0 and id < len(self.block.program.blocks))
        return self.block.program.blocks[id]

-    def blocks_attr(self, name):
+    def _blocks_attr(self, name):
        """
        Get the blocks attribute  by name.

@@ -886,13 +884,13 @@ class Operator(object):
            list: list of the blocks attribute.
        """
        attrs = []
-        for i in self.blocks_attr_ids(name):
+        for i in self._blocks_attr_ids(name):
            assert (i >= 0 and i < len(self.block.program.blocks))
            attrs.append(self.block.program.blocks[i])

        return attrs

-    def blocks_attr_ids(self, name):
+    def _blocks_attr_ids(self, name):
        """
        Get the blocks attribute's ids by name.

@@ -903,7 +901,7 @@ class Operator(object):
            list: list of the blocks ids.
        """

-        return self.desc.blocks_attr_ids(name)
+        return self.desc._blocks_attr_ids(name)

    def all_attrs(self):
        """
@@ -917,11 +915,11 @@ class Operator(object):
        for n in attr_names:
            attr_type = self.desc.attr_type(n)
            if attr_type == core.AttrType.BLOCK:
-                attr_map[n] = self.block_attr(n)
+                attr_map[n] = self._block_attr(n)
                continue

            if attr_type == core.AttrType.BLOCKS:
-                attr_map[n] = self.blocks_attr(n)
+                attr_map[n] = self._blocks_attr(n)
                continue

            attr_map[n] = self.attr(n)
@@ -1795,7 +1793,7 @@ class Program(object):
            for j in six.moves.range(block.op_size()):
                op = block.op(j)
                if op.has_attr('is_test'):
-                    op.set_attr('is_test', True)
+                    op._set_attr('is_test', True)
        res.blocks = [
            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
        ]
@@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None):
        switch_startup_program(startup_program)


-def get_var(name, program=None):
+def _get_var(name, program=None):
    """
    Get a variable by name from the global block of a program.


--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -600,7 +600,7 @@ def save_inference_model(dirname,
    """
    if isinstance(feeded_var_names, six.string_types):
        feeded_var_names = [feeded_var_names]
-    else:
+    elif export_for_deployment:
        if len(feeded_var_names) > 0:
            # TODO(paddle-dev): polish these code blocks
            if not (bool(feeded_var_names) and all(
@@ -610,61 +610,60 @@ def save_inference_model(dirname,

    if isinstance(target_vars, Variable):
        target_vars = [target_vars]
-    else:
+    elif export_for_deployment:
        if not (bool(target_vars) and all(
                isinstance(var, Variable) for var in target_vars)):
            raise ValueError("'target_vars' should be a list of Variable.")

    if main_program is None:
        main_program = default_main_program()
-    copy_program = main_program.clone()
+
+    # if there is lookup table, the trainer 0 will notify all pserver to save.
+    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        _save_lookup_tables_by_notify(executor, lookup_table_filename,
+                                      main_program._distributed_lookup_table,
+                                      main_program._endpoints)

    if not os.path.isdir(dirname):
        os.makedirs(dirname)
+    if model_filename is not None:
+        model_basename = os.path.basename(model_filename)
+    else:
+        model_basename = "__model__"
+    model_basename = os.path.join(dirname, model_basename)

    # When export_for_deployment is true, we modify the program online so that
    # it can only be loaded for inference directly. If it's false, the whole
    # original program and related meta are saved so that future usage can be
    # more flexible.
    if export_for_deployment:
-        global_block = copy_program.global_block()
+        main_program = main_program.clone()
+        global_block = main_program.global_block()
        for i, op in enumerate(global_block.ops):
            op.desc.set_is_target(False)
            if op.type == "feed" or op.type == "fetch":
                global_block._remove_op(i)
-        copy_program.desc.flush()
+        main_program.desc.flush()

-        pruned_program = copy_program._prune(targets=target_vars)
-        saved_program = pruned_program._inference_optimize(prune_read_op=True)
+        main_program = main_program._prune(targets=target_vars)
+        main_program = main_program._inference_optimize(prune_read_op=True)
        fetch_var_names = [v.name for v in target_vars]

-        prepend_feed_ops(saved_program, feeded_var_names)
-        append_fetch_ops(saved_program, fetch_var_names)
+        prepend_feed_ops(main_program, feeded_var_names)
+        append_fetch_ops(main_program, fetch_var_names)
+
+        with open(model_basename, "wb") as f:
+            f.write(main_program.desc.serialize_to_string())
    else:
        # TODO(panyx0718): Save more information so that it can also be used
        # for training and more flexible post-processing.
-        saved_program = copy_program
-
-    if model_filename is not None:
-        model_filename = os.path.basename(model_filename)
-    else:
-        model_filename = "__model__"
-    model_filename = os.path.join(dirname, model_filename)
+        with open(model_basename + ".main_program", "wb") as f:
+            f.write(main_program.desc.serialize_to_string())

    if params_filename is not None:
        params_filename = os.path.basename(params_filename)
-
-    with open(model_filename, "wb") as f:
-        f.write(saved_program.desc.serialize_to_string())
-
-    save_persistables(executor, dirname, saved_program, params_filename)
-
-    # if there is lookup table, the trainer 0 will notify all pserver to save.
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        _save_lookup_tables_by_notify(executor, lookup_table_filename,
-                                      main_program._distributed_lookup_table,
-                                      main_program._endpoints)
+    save_persistables(executor, dirname, main_program, params_filename)


 def load_inference_model(dirname,

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -284,7 +284,7 @@ def detection_output(loc,
        target_box=loc,
        code_type='decode_center_size')
    compile_shape = scores.shape
-    run_shape = ops.shape(scores)
+    run_shape = nn.shape(scores)
    scores = nn.flatten(x=scores, axis=2)
    scores = nn.softmax(input=scores)
    scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
@@ -697,7 +697,7 @@ def ssd_loss(location,
        raise ValueError("Only support mining_type == max_negative now.")

    num, num_prior, num_class = confidence.shape
-    conf_shape = ops.shape(confidence)
+    conf_shape = nn.shape(confidence)

    def __reshape_to_2d(var):
        return nn.flatten(x=var, axis=2)
@@ -724,7 +724,7 @@ def ssd_loss(location,
    target_label.stop_gradient = True
    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
    # 3. Mining hard examples
-    actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2])
+    actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
    actual_shape.stop_gradient = True
    conf_loss = nn.reshape(
        x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -507,7 +507,6 @@ def py_reader(capacity,

        1. The basic usage of :code:`py_reader` is as follows:

-        >>> import paddle.v2
        >>> import paddle.fluid as fluid
        >>> import paddle.dataset.mnist as mnist
        >>>
@@ -515,7 +514,7 @@ def py_reader(capacity,
        >>>                                 shapes=[(-1,3,224,224), (-1,1)],
        >>>                                 dtypes=['float32', 'int64'])
        >>> reader.decorate_paddle_reader(
-        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>     paddle.reader.shuffle(paddle.batch(mnist.train())))
        >>>
        >>> img, label = fluid.layers.read_file(reader)
        >>> loss = network(img, label) # some network definition
@@ -534,7 +533,6 @@ def py_reader(capacity,
        2. When training and testing are both performed, two different
        :code:`py_reader` should be created with different names, e.g.:

-        >>> import paddle.v2
        >>> import paddle.fluid as fluid
        >>> import paddle.dataset.mnist as mnist
        >>>
@@ -548,7 +546,7 @@ def py_reader(capacity,
        >>>                                       dtypes=['float32', 'int64'],
        >>>                                       name='train_reader')
        >>> train_reader.decorate_paddle_reader(
-        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>     paddle.reader.shuffle(paddle.batch(mnist.train())))
        >>>
        >>> test_reader = fluid.layers.py_reader(capacity=32,
        >>>                                      shapes=[(-1,3,224,224), (-1,1)],

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
    return acc_out


-def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
+def auc(input,
+        label,
+        curve='ROC',
+        num_thresholds=2**12 - 1,
+        topk=1,
+        slide_steps=1):
    """
    **Area Under the Curve (AUC) Layer**

@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
        num_thresholds(int): The number of thresholds to use when discretizing
                             the roc curve. Default 200.
        topk(int): only topk number of prediction output will be used for auc.
+        slide_steps(int): number of most recent steps used when computing the batch AUC. slide_steps=1 uses only the current step, slide_steps=3 uses the current step and the two previous steps, and slide_steps=0 uses all steps. Default 1.
+

    Returns:
        Variable: A scalar representing the current AUC.
@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
    auc_out = helper.create_tmp_variable(dtype="float64")
    batch_auc_out = helper.create_tmp_variable(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
+
+    # for batch auc
+    batch_stat_pos = helper.create_global_variable(
+        persistable=True,
+        dtype='int64',
+        shape=[slide_steps, num_thresholds + 1])
+    batch_stat_neg = helper.create_global_variable(
+        persistable=True,
+        dtype='int64',
+        shape=[slide_steps, num_thresholds + 1])
+
+    # for global auc
    stat_pos = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
    stat_neg = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])

-    for var in [stat_pos, stat_neg]:
+    for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
        helper.set_variable_initializer(
            var, Constant(
                value=0.0, force_cpu=True))

+    # Batch AUC
+    helper.append_op(
+        type="auc",
+        inputs={
+            "Predict": [input],
+            "Label": [label],
+            "StatPos": [batch_stat_pos],
+            "StatNeg": [batch_stat_neg]
+        },
+        attrs={
+            "curve": curve,
+            "num_thresholds": num_thresholds,
+            "slide_steps": slide_steps
+        },
+        outputs={
+            "AUC": [batch_auc_out],
+            "StatPosOut": [batch_stat_pos],
+            "StatNegOut": [batch_stat_neg]
+        })
+    # Global AUC
    helper.append_op(
        type="auc",
        inputs={
@@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
            "StatPos": [stat_pos],
            "StatNeg": [stat_neg]
        },
-        attrs={"curve": curve,
-               "num_thresholds": num_thresholds},
+        attrs={
+            "curve": curve,
+            "num_thresholds": num_thresholds,
+            "slide_steps": 0
+        },
        outputs={
            "AUC": [auc_out],
-            "BatchAUC": [batch_auc_out],
            "StatPosOut": [stat_pos],
            "StatNegOut": [stat_neg]
        })
-    return auc_out, batch_auc_out, [stat_pos, stat_neg]
+    return auc_out, batch_auc_out, [
+        batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
+    ]
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -45,13 +45,6 @@ __all__ = [
    'logical_or',
    'logical_xor',
    'logical_not',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'sampling_id',
-    'gaussian_random_batch_size_like',
-    'sum',
-    'slice',
-    'shape',
    'maxout',
 ]

@@ -63,6 +56,8 @@ for _OP in set(__all__):
 # e.g.: test_program_code.py, test_dist_train.py
 globals()['_scale'] = generate_layer_fn('scale')

+globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
+
 __all__ += __activations_noattr__

 for _OP in set(__activations_noattr__):

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -26,6 +26,7 @@ from .layer_helper import LayerHelper
 from .regularizer import append_regularization_ops
 from .clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager
+from .layers import ops

 __all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
@@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer):
            x=tmp, dtype='float32' if self._dtype == None else self._dtype)
        sum = layers.cast(
            x=sum, dtype='float32' if self._dtype == None else self._dtype)
-        layers.elementwise_div(x=sum, y=tmp, out=param)
+        ops._elementwise_div(x=sum, y=tmp, out=param)

    def _add_average_restore_op(self, block, param_grad):
        param = block._clone_variable(param_grad[0])

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -67,6 +67,7 @@ def train(nn_type,
          use_cuda,
          parallel,
          save_dirname=None,
+          save_full_dirname=None,
          model_filename=None,
          params_filename=None,
          is_local=True):
@@ -143,6 +144,13 @@ def train(nn_type,
                                exe,
                                model_filename=model_filename,
                                params_filename=params_filename)
+                        if save_full_dirname is not None:
+                            fluid.io.save_inference_model(
+                                save_full_dirname, [], [],
+                                exe,
+                                model_filename=model_filename,
+                                params_filename=params_filename,
+                                export_for_deployment=False)
                        return
                    else:
                        print(
@@ -214,10 +222,12 @@ def infer(use_cuda,

 def main(use_cuda, parallel, nn_type, combine):
    save_dirname = None
+    save_full_dirname = None
    model_filename = None
    params_filename = None
    if not use_cuda and not parallel:
        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
+        save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
        if combine == True:
            model_filename = "__model_combined__"
            params_filename = "__params_combined__"
@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine):
        use_cuda=use_cuda,
        parallel=parallel,
        save_dirname=save_dirname,
+        save_full_dirname=save_full_dirname,
        model_filename=model_filename,
        params_filename=params_filename)
    infer(

--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -47,7 +47,7 @@ def cnn_model(data):
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.3)))
+            value=0.01)))
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
@@ -56,7 +56,7 @@ def cnn_model(data):
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=0.2)))
+            value=0.01)))

    SIZE = 10
    input_shape = conv_pool_2.shape
@@ -68,7 +68,7 @@ def cnn_model(data):
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.1)))
+            initializer=fluid.initializer.Constant(value=0.01)))
    return predict



--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size,
    if weight_sharing:
        predict = layers.matmul(
            x=dec_output,
-            y=fluid.get_var(word_emb_param_names[0]),
+            y=fluid.framework._get_var(word_emb_param_names[0]),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
@@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, use_cuda, args):
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        TrainTaskConfig.use_gpu = use_cuda
-        sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
+    def run_trainer(self, args):
+        TrainTaskConfig.use_gpu = args.use_cuda
+        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
            args.is_dist, not args.sync_mode)

        if args.is_dist:
@@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase):
            TrainTaskConfig.batch_size = 20
            trainer_prog = fluid.default_main_program()

+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
        startup_exe = fluid.Executor(place)

        TrainTaskConfig.local = not args.is_dist

--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
@@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase):


 if __name__ == "__main__":
+    import os
+    os.environ['CPU_NUM'] = '1'
+    os.environ['USE_CUDA'] = "FALSE"
    runtime_main(TestDistWord2vec2x2)
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                    "Output (" + out_name + ") has diff at " + str(place) +
                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                    str(actual_t))
+                    str(actual_t) + " in class " + self.__class__.__name__)
                if isinstance(expect, tuple):
                    self.assertListEqual(actual.recursive_sequence_lengths(),
                                         expect[1], "Output (" + out_name +

--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py