diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 0f9521616952a2857222feab8c38fb480761ee2d..a777a4974cc377db103a470698f817612a4e9a32 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1,11 +1,9 @@
 add_custom_target(paddle_apis ALL
-  DEPENDS paddle_v2_apis paddle_fluid_apis)
+  DEPENDS paddle_v2_apis)
 
 add_custom_target(paddle_docs ALL
   DEPENDS paddle_v2_docs paddle_v2_docs_cn
-          paddle_fluid_docs paddle_fluid_docs_cn
           paddle_mobile_docs paddle_mobile_docs_cn)
 
 add_subdirectory(v2)
-add_subdirectory(fluid)
 add_subdirectory(mobile)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 5e08c97746d595775b82008360c6b4907ddb7b21..9feecc8f8fee6172663d859976dcbce709c8ada8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -153,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn'
 paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
 paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
 paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
+paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
+paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
+paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
+paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
+paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -224,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
 paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -269,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -298,6 +298,7 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 6d8cbe5d9e491555a94e9420995149041213ab79..69c6dd02005902d9ee486a0b7f8f0c9a343be255 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -150,11 +150,10 @@ else()
 endif()
 
 if (NOT WIN32)
-  cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
-        graph graph_viz_pass multi_devices_graph_pass
-        multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph build_strategy
+        fast_threaded_ssa_graph_executor)
 endif() # NOT WIN32
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f..e0a3ef5a9c6c53c42ebea1a41cac0d18a77781b2 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #   device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+
+cc_library(build_strategy SRCS build_strategy.cc DEPS
+           graph_viz_pass multi_devices_graph_pass
+           multi_devices_graph_print_pass multi_devices_graph_check_pass
+           fuse_elewise_add_act_pass)
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a6b497fa897e3882995688bf36704b1d77ea962
--- /dev/null
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -0,0 +1,126 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ParallelExecutorPassBuilder : public ir::PassBuilder {
+ public:
+  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
+      : ir::PassBuilder(), strategy_(strategy) {
+    // Add a graph viz pass to record a graph.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto viz_pass = AppendPass("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    }
+
+    // Add op fusion.
+    if (strategy.fuse_elewise_add_act_ops_) {
+      auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
+      // Add a graph viz pass to record a graph.
+      if (!strategy.debug_graphviz_path_.empty()) {
+        auto viz_pass = AppendPass("graph_viz_pass");
+        const std::string graph_path = string::Sprintf(
+            "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
+        viz_pass->Set<std::string>("graph_viz_path",
+                                   new std::string(graph_path));
+      }
+    }
+
+    // Convert graph to run on multi-devices.
+    auto multi_devices_pass = AppendPass("multi_devices_pass");
+    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
+                                                         &strategy_);
+
+    // Add a graph print pass to record a graph with device info.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
+      multi_devices_print_pass->SetNotOwned<const std::string>(
+          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+          "graph_printer", new details::GraphvizSSAGraphPrinter);
+    }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
+  }
+
+ private:
+  BuildStrategy strategy_;
+};
+
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
+    const {
+  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  return pass_builder_;
+}
+
+std::unique_ptr<ir::Graph> BuildStrategy::Apply(
+    const ProgramDesc &main_program,
+    const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+    const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
+#else
+    const bool use_cuda) const {
+#endif
+  // Create a default one if not initialized by user.
+  if (!pass_builder_) {
+    CreatePassesFromStrategy();
+  }
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+
+  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
+    if (pass->Type() == "multi_devices_pass") {
+      pass->Erase("places");
+      pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
+      pass->Erase("loss_var_name");
+      pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
+      pass->Erase("params");
+      pass->SetNotOwned<const std::unordered_set<std::string>>("params",
+                                                               &param_names);
+      pass->Erase("local_scopes");
+      pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                    &local_scopes);
+#ifdef PADDLE_WITH_CUDA
+      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase("nccl_ctxs");
+      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+    }
+    graph = pass->Apply(std::move(graph));
+  }
+  return graph;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(fuse_elewise_add_act_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(multi_devices_pass);
+USE_PASS(multi_devices_check_pass);
+USE_PASS(multi_devices_print_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 77cafa49f18158ea4187908856e10adaa0f916c9..02c4bea16916d58a6d0fce8918f8fceb9ff9356e 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -15,6 +15,17 @@
 #pragma once
 
 #include <string>
+#include <memory>
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -57,6 +68,30 @@ struct BuildStrategy {
   bool fuse_elewise_add_act_ops_{false};
 
   bool enable_data_balance_{false};
+
+  // Users normally do not need to call this API.
+  // The PassBuilder allows for more customized insertion and removal of
+  // passes from the Python side.
+  // A new PassBuilder is created based on configs defined above and
+  // passes are owned by the PassBuilder.
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+
+  // Apply the passes built by the pass_builder_. The passes will be
+  // applied to the Program and output an ir::Graph.
+  std::unique_ptr<ir::Graph> Apply(
+      const ProgramDesc &main_program,
+      const std::vector<platform::Place> &places,
+      const std::string &loss_var_name,
+      const std::unordered_set<std::string> &param_names,
+      const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
+#else
+      const bool use_cuda) const;
+#endif
+
+ private:
+  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };
 
 }  // namespace details
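
The two files above move the pass pipeline out of ParallelExecutor: BuildStrategy now owns a lazily created PassBuilder, and Apply() threads the program through every registered pass. A minimal sketch of how a caller might customize the pipeline — the strategy fields and methods come from this patch, but "my_custom_pass" is a hypothetical pass name that would need to be registered with REGISTER_PASS elsewhere:

    paddle::framework::details::BuildStrategy strategy;
    strategy.fuse_elewise_add_act_ops_ = true;   // adds fuse_elewise_add_act_pass
    strategy.debug_graphviz_path_ = "/tmp/viz";  // adds the graph_viz passes
    std::shared_ptr<paddle::framework::ir::PassBuilder> builder =
        strategy.CreatePassesFromStrategy();
    builder->AppendPass("my_custom_pass");  // hypothetical extra pass
    // A later strategy.Apply(program, places, ...) runs the customized pipeline.
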
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 21f75957be5f33f3dfc09c41fa9a1e1ca590f99e..090517ff3c1822c2e62e61fad05d49e1c8db8573 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,79 +20,37 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-// Change it to thread safe flags if needed.
-class ThreadUnsafeOwnershipFlags {
+template <typename T>
+class COWPtr {
  public:
-  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
-
-  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags& operator=(
-      const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
-
-  void SetOwnership(bool flag) { flag_ = flag; }
-
-  // Invoke the callback if it is not owned.
-  template <typename Callback>
-  void AcquireOwnershipOnce(Callback acquire) {
-    if (!flag_) {
-      acquire();
-      flag_ = true;
-    }
-  }
+  typedef std::shared_ptr<T> RefPtr;
 
 private:
-  bool flag_;
-};
+  RefPtr m_sp;
 
-// Copy-On-Write pointer.
-// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
-//
-// The template parameter OwnershipFlags should have:
-// * a constructor takes a bool. True if own.
-// * SetOwnership(bool flag).
-// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
-// owned.
-//
-// https://en.wikipedia.org/wiki/Copy-on-write
-template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
-class COWPtr {
  public:
-  // Ctor from raw pointer.
-  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+  COWPtr() : m_sp(nullptr) {}
+  explicit COWPtr(T* t) : m_sp(t) {}
 
-  // Move methods. Steal ownership from origin
-  COWPtr(COWPtr&& other)
-      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
-  COWPtr& operator=(COWPtr&& origin) = default;
+  const T& Data() const { return *m_sp; }
 
-  // Copy methods. Not own payload
-  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
-  COWPtr& operator=(const COWPtr& other) {
-    payload_ = other.payload_;
-    ownership_.SetOwnership(false);
-    return *this;
-  }
-
-  // Access read only data.
-  const T& Data() const { return *payload_; }
-
-  // Access mutable data. If the data is not owned, the data will be copied
-  // before.
   T* MutableData() {
-    ownership_.AcquireOwnershipOnce(
-        [this] { payload_.reset(new T(*payload_)); });
-    return payload_.get();
+    DetachIfNotUnique();
+    return m_sp.get();
   }
 
- private:
-  // Actual data pointer.
-  std::shared_ptr<T> payload_;
+  void DetachIfNotUnique() {
+    T* tmp = m_sp.get();
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
+  }
 
-  // Ownership flag.
-  OwnershipFlags ownership_;
+  void Detach() {
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
+  }
 };
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc
index d2142af277c0b356d83941b3baab1947cce31dac..5b055d7cb4d127dc20f2cf70869134f24a93d429 100644
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
   ASSERT_EQ(ptr2.Data(), 10);
 }
 
+TEST(COWPtr, change_old) {
+  COWPtr<int> ptr(new int{0});
+  COWPtr<int> ptr2 = ptr;
+  *ptr.MutableData() = 10;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(ptr.Data(), 10);
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
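
The rewritten COWPtr drops the ownership-flag machinery in favor of shared_ptr use-counting: copies alias one payload, and MutableData() clones the payload only while the reference count is above one. A sketch of the semantics the new change_old test pins down:

    using paddle::framework::details::COWPtr;
    COWPtr<int> a(new int{1});
    COWPtr<int> b = a;     // a and b share one payload (use_count == 2)
    *b.MutableData() = 2;  // not unique, so b detaches to its own copy first
    // Now a.Data() == 1 and b.Data() == 2; readers of a never see b's write.
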
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 4dca3ceb4569fb708c7a98621c5239acbe217586..9796f2778955ce47fe9fd1b13baa36404b1574c5 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -41,6 +41,8 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
 
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
 
+cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
+
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5..1c75cb5a82029b6a542a3a2f031a353f5e40f4ea 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   PDPattern external_pattern, subblock_pattern;
 
+  // Use the following variables to tell whether this model is RNN1.
+  // This fuse can only work on the RNN1 model.
+  std::unordered_set<std::string> specified_vars({"data_lod_attention",
+                                                  "cell_init", "hidden_init",
+                                                  "data", "week", "minute"});
+  int count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsVar() && specified_vars.count(node->Name())) {
+      ++count;
+    }
+  }
+  if (count < specified_vars.size()) {
+    return graph;
+  }
+
+  // Continue to fuse.
   FindWhileOp(graph.get());
   return graph;
 }
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index aa95d3e9f6c8221f6e48d192b73ad5135539dc75..f5c286486520391906a6cd7545041c8a7df614ea 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   const std::string BatchedCellPreAct =
       patterns::UniqueKey("BatchedCellPreAct");
   const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+  const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
 
   scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
   scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
   scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+  scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
 
   op_desc.SetInput("H0", {});
   op_desc.SetInput("C0", {});
@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   op_desc.SetOutput("BatchedGate", {BatchedGate});
   op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
   op_desc.SetOutput("BatchedInput", {BatchedInput});
+  op_desc.SetOutput("CheckedCell", {CheckedCell});
   op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
   op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
   // TODO(TJ): get from attr
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index d7158eba62686be57499df697466797e4034ea8f..6cf405efe63d2bc284c4650771a747b27bb4a9f6 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
-  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
   PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
   for (const std::string& attr : required_pass_attrs_) {
     PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 0f14083d259172f5b5f1ed80c7d38312d711beb5..9570c59cff2a6afeb1c607f7219b7b455974d6ce 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -42,6 +42,8 @@ class Pass {
     attr_dels_.clear();
   }
 
+  std::string Type() const { return type_; }
+
   std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
 
   // Get a reference to the attributed previously set.
@@ -52,6 +54,21 @@ class Pass {
     return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+
+  void Erase(const std::string &attr_name) {
+    if (!Has(attr_name)) {
+      return;
+    }
+    if (attr_dels_.find(attr_name) != attr_dels_.end()) {
+      attr_dels_[attr_name]();
+      attr_dels_.erase(attr_name);
+    }
+    attrs_.erase(attr_name);
+  }
+
   // Set a pointer to the attribute. Pass takes ownership of the attribute.
   template <typename AttrType>
   void Set(const std::string &attr_name, AttrType *attr) {
@@ -68,13 +85,15 @@ class Pass {
   // should delete the attribute.
   template <typename AttrType>
   void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
    attrs_[attr_name] = attr;
   }
 
  protected:
-  virtual std::unique_ptr<Graph> ApplyImpl(
-      std::unique_ptr<Graph> graph) const = 0;
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    LOG(FATAL) << "Calling virtual Pass not implemented.";
+  }
 
  private:
  template <typename PassType>
@@ -89,7 +108,10 @@ class Pass {
     required_graph_attrs_.insert(attrs.begin(), attrs.end());
   }
 
+  void RegisterType(const std::string &type) { type_ = type; }
+
   mutable bool applied_{false};
+  std::string type_;
   std::unordered_set<std::string> required_pass_attrs_;
   std::unordered_set<std::string> required_graph_attrs_;
   std::map<std::string, boost::any> attrs_;
@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
     PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
                    "'%s' is registered more than once.", pass_type);
     PassRegistry::Instance().Insert(
-        pass_type, [this]() -> std::unique_ptr<Pass> {
+        pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
          std::unique_ptr<Pass> pass(new PassType());
          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          pass->RegisterType(pass_type);
          return pass;
        });
   }
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0719867b34d13666672b22070ce14dbaf80d85d
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_builder.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
+  auto pass = ir::PassRegistry::Instance().Get(pass_type);
+  passes_.emplace_back(pass.release());
+  return passes_.back();
+}
+
+void PassBuilder::RemovePass(size_t idx) {
+  PADDLE_ENFORCE(passes_.size() > idx);
+  passes_.erase(passes_.begin() + idx);
+}
+
+std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
+                                              const std::string& pass_type) {
+  PADDLE_ENFORCE(passes_.size() >= idx);
+  std::shared_ptr<Pass> pass(
+      ir::PassRegistry::Instance().Get(pass_type).release());
+  passes_.insert(passes_.begin() + idx, std::move(pass));
+  return passes_[idx];
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..733d3a3ad1ab8989ea30fe45cd7e1ffe9432de13
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_builder.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class PassBuilder {
+ public:
+  PassBuilder() {}
+
+  virtual ~PassBuilder() {}
+
+  // Append a new pass to the end.
+  std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
+
+  // Insert a new pass at position `idx`.
+  std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
+
+  // Remove the pass at position `idx`.
+  void RemovePass(size_t idx);
+
+  // Returns a list of all passes.
+  std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
+
+ protected:
+  std::vector<std::shared_ptr<Pass>> passes_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
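
Together with the new Pass::Type(), Has(), and Erase() methods above, PassBuilder gives callers positional control over a pass pipeline. A brief sketch, using only pass names this patch registers:

    paddle::framework::ir::PassBuilder builder;
    builder.AppendPass("graph_viz_pass");               // index 0
    builder.AppendPass("fuse_elewise_add_act_pass");    // index 1
    builder.InsertPass(1, "multi_devices_check_pass");  // now at index 1
    builder.RemovePass(0);                              // drops graph_viz_pass
    for (auto &pass : builder.AllPasses()) {
      // Has()/Erase() allow attributes to be reconfigured before Apply().
      if (pass->Has("graph_viz_path")) pass->Erase("graph_viz_path");
    }
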
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
index 5b5011412ed39e033a7a65921e9c64ce2d54c638..6ad7d1df8bdd016b617c820c022ef55f23ba21cd 100644
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
   ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
   ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
 
-  try {
-    graph = pass->Apply(std::move(graph));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+  // Applying a pass more than once is allowed.
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph = pass->Apply(std::move(graph));
 
   pass = PassRegistry::Instance().Get("test_pass");
   pass->SetNotOwned<int>("test_pass_attr", &val);
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 7836ecb1272a07a79a70c9cb040335f9a42e5684..77386f4f069489b6ff7b927a281bdc286ff816e0 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,10 +17,13 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #include <vector>
-
+#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memcpy.h"
 
 #include "glog/logging.h"
 
@@ -28,206 +31,436 @@ namespace paddle {
 namespace framework {
 
 #if defined(PADDLE_WITH_CUDA)
+namespace details {
+struct CUDABuffer {
+  void *data_{nullptr};
+  size_t size_{0};
+  platform::CUDAPlace place_;
+
+  CUDABuffer() {}
+  CUDABuffer(platform::Place place, size_t size)
+      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
+    data_ = memory::Alloc(place_, size);
+  }
+
+  ~CUDABuffer() { ClearMemory(); }
+
+  CUDABuffer(const CUDABuffer &o) = delete;
+  CUDABuffer &operator=(const CUDABuffer &o) = delete;
+
+  void Resize(platform::Place place, size_t size) {
+    ClearMemory();
+    place_ = boost::get<platform::CUDAPlace>(place);
+    data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
+    size_ = size;
+  }
+
+  void Swap(CUDABuffer &o) {
+    std::swap(data_, o.data_);
+    std::swap(place_, o.place_);
+    std::swap(size_, o.size_);
+  }
+
+ private:
+  void ClearMemory() const {
+    if (data_ != nullptr) {
+      memory::Free(place_, data_);
+    }
+  }
+};
+}  // namespace details
+
 // Vector implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
 class Vector {
  public:
   using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;
 
-  // Default ctor. Create empty Vector
-  Vector() { InitEmpty(); }
+ private:
+  // The actual class to implement vector logic
+  class VectorData {
+   public:
+    VectorData() : flag_(kDataInCPU) {}
+    VectorData(size_t count, const T &value)
+        : cpu_(count, value), flag_(kDataInCPU) {}
+    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
+    template <typename U>
+    explicit VectorData(const std::vector<U> &dat)
+        : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
+
+    VectorData(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+    }
 
-  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T &value = T()) {
-    InitEmpty();
-    if (count != 0) {
-      resize(count);
-      T *ptr = begin();
-      for (size_t i = 0; i < count; ++i) {
-        ptr[i] = value;
+    VectorData &operator=(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+      details::CUDABuffer null;
+      gpu_.Swap(null);
+      return *this;
+    }
+
+    T &operator[](size_t i) {
+      MutableCPU();
+      return cpu_[i];
+    }
+
+    const T &operator[](size_t i) const {
+      ImmutableCPU();
+      return cpu_[i];
+    }
+
+    size_t size() const { return cpu_.size(); }
+
+    iterator begin() {
+      MutableCPU();
+      return cpu_.begin();
+    }
+
+    iterator end() {
+      MutableCPU();
+      return cpu_.end();
+    }
+
+    T &front() {
+      MutableCPU();
+      return cpu_.front();
+    }
+
+    T &back() {
+      MutableCPU();
+      return cpu_.back();
+    }
+
+    const_iterator begin() const {
+      ImmutableCPU();
+      return cpu_.begin();
+    }
+
+    const_iterator end() const {
+      ImmutableCPU();
+      return cpu_.end();
+    }
+
+    const T &back() const {
+      ImmutableCPU();
+      return cpu_.back();
+    }
+
+    T *data() { return &(*this)[0]; }
+
+    const T *data() const { return &(*this)[0]; }
+
+    const T &front() const {
+      ImmutableCPU();
+      return cpu_.front();
+    }
+
+    // assign this from iterator.
+    // NOTE: the iterator must support `end-begin`
+    template <typename Iter>
+    void assign(Iter begin, Iter end) {
+      MutableCPU();
+      cpu_.assign(begin, end);
+    }
+
+    // push_back. If the previous capacity is not enough, the memory will
+    // double.
+    void push_back(T elem) {
+      MutableCPU();
+      cpu_.push_back(elem);
+    }
+
+    // extend a vector by iterator.
+    // NOTE: the iterator must support end-begin
+    template <typename It>
+    void Extend(It begin, It end) {
+      MutableCPU();
+      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
+      std::copy(begin, end, out_it);
+    }
+
+    // resize the vector
+    void resize(size_t size) {
+      MutableCPU();
+      cpu_.resize(size);
+    }
+
+    // get cuda ptr. immutable
+    const T *CUDAData(platform::Place place) const {
+      PADDLE_ENFORCE(platform::is_gpu_place(place),
+                     "CUDA Data must on CUDA place");
+      ImmutableCUDA(place);
+      return reinterpret_cast<T *>(gpu_.data_);
+    }
+
+    // get cuda ptr. mutable
+    T *CUDAMutableData(platform::Place place) {
+      const T *ptr = CUDAData(place);
+      flag_ = kDirty | kDataInCUDA;
+      return const_cast<T *>(ptr);
+    }
+
+    // clear
+    void clear() {
+      cpu_.clear();
+      flag_ = kDirty | kDataInCPU;
+    }
+
+    size_t capacity() const { return cpu_.capacity(); }
+
+    // reserve data
+    void reserve(size_t size) const { cpu_.reserve(size); }
+
+    // implicit cast operator. Vector can be cast to std::vector implicitly.
+    operator std::vector<T>() const {
+      ImmutableCPU();
+      return cpu_;
+    }
+
+    bool operator==(const VectorData &other) const {
+      ImmutableCPU();
+      other.ImmutableCPU();
+      return cpu_ == other.cpu_;
+    }
+
+    std::mutex &Mutex() const { return mtx_; }
+
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
+      if (gpu_.data_ == nullptr) {
+        return nullptr;
+      } else {
+        return std::unique_ptr<platform::CUDAPlace>(
+            new platform::CUDAPlace(gpu_.place_));
       }
     }
-  }
 
-  // Ctor with init_list
-  Vector(std::initializer_list<T> init) {
-    if (init.size() == 0) {
-      InitEmpty();
-    } else {
-      InitByIter(init.size(), init.begin(), init.end());
+   private:
+    enum DataFlag {
+      kDataInCPU = 0x01,
+      kDataInCUDA = 0x02,
+      // kDirty means the data has been changed in one device.
+      kDirty = 0x10
+    };
+
+    void CopyToCPU() const {
+      // COPY GPU Data To CPU
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
+      void *src = gpu_.data_;
+      void *dst = cpu_.data();
+      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
+                   stream);
+      dev_ctx->Wait();
+    }
+
+    void MutableCPU() {
+      if (IsInCUDA() && IsDirty()) {
+        CopyToCPU();
+      }
+      flag_ = kDirty | kDataInCPU;
     }
-  }
+
+    void ImmutableCUDA(platform::Place place) const {
+      if (IsDirty()) {
+        if (IsInCPU()) {
+          CopyCPUDataToCUDA(place);
+          UnsetFlag(kDirty);
+          SetFlag(kDataInCUDA);
+        } else if (IsInCUDA() &&
+                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+          PADDLE_THROW("This situation should not happen");
+          // Still dirty
+        } else {
+          // Dirty && DataInCUDA && Device is same
+          // Do nothing
+        }
+      } else {
+        if (!IsInCUDA()) {
+          // Even data is not dirty. However, data is not in CUDA. Copy data.
+          CopyCPUDataToCUDA(place);
+          SetFlag(kDataInCUDA);
+        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+          PADDLE_THROW("This situation should not happen.");
+        } else {
+          // Not Dirty && DataInCUDA && Device is same
+          // Do nothing.
+        }
+      }
+    }
+
+    void CopyCPUDataToCUDA(const platform::Place &place) const {
+      void *src = cpu_.data();
+      gpu_.Resize(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_.data_;
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
+      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
+                   stream);
+    }
+
+    void ImmutableCPU() const {
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
+                                      // CPU has no data.
+        CopyToCPU();
+        UnsetFlag(kDirty);
+      }
+      SetFlag(kDataInCPU);
+    }
+
+    void UnsetFlag(int flag) const { flag_ &= ~flag; }
+    void SetFlag(int flag) const { flag_ |= flag; }
+
+    bool IsDirty() const { return flag_ & kDirty; }
+
+    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+    bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+    mutable std::vector<T> cpu_;
+    mutable details::CUDABuffer gpu_;
+    mutable int flag_;
+
+    mutable std::mutex mtx_;
+  };
+
+ public:
+  // Default ctor. Create empty Vector
+  Vector() : m_(new VectorData()) {}
+
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T &value = T())
+      : m_(new VectorData(count, value)) {}
+
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
 
   // implicit cast from std::vector.
   template <typename U>
-  Vector(const std::vector<U> &dat) {  // NOLINT
-    if (dat.size() == 0) {
-      InitEmpty();
-    } else {
-      InitByIter(dat.size(), dat.begin(), dat.end());
-    }
+  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
   }
 
   // Copy ctor
-  Vector(const Vector<T> &other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { m_ = other.m_; }
 
   // Copy operator
   Vector<T> &operator=(const Vector<T> &other) {
-    if (other.size() != 0) {
-      this->InitByIter(other.size(), other.begin(), other.end());
-    } else {
-      InitEmpty();
-    }
+    m_ = other.m_;
     return *this;
   }
 
   // Move ctor
-  Vector(Vector<T> &&other) {
-    this->size_ = other.size_;
-    this->flag_ = other.flag_;
-    if (other.cuda_vec_.memory_size()) {
-      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
-    }
-    if (other.cpu_vec_.memory_size()) {
-      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
-    }
-  }
+  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
 
   // CPU data access method. Mutable.
-  T &operator[](size_t i) {
-    MutableCPU();
-    return const_cast<T *>(cpu_vec_.data<T>())[i];
-  }
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
 
   // CPU data access method. Immutable.
-  const T &operator[](size_t i) const {
-    ImmutableCPU();
-    return cpu_vec_.data<T>()[i];
-  }
+  const T &operator[](size_t i) const { return m_.Data()[i]; }
 
   // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return size_; }
+  size_t size() const { return m_.Data().size(); }
 
-  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  iterator begin() { return m_.MutableData()->begin(); }
 
-  T *end() {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  iterator end() { return m_.MutableData()->end(); }
 
-  T &front() { return *begin(); }
+  T &front() { return m_.MutableData()->front(); }
 
-  T &back() {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  T &back() { return m_.MutableData()->back(); }
 
-  const T *begin() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
-  }
+  const_iterator begin() const { return m_.Data().begin(); }
 
-  const T *end() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  const_iterator end() const { return m_.Data().end(); }
 
-  const T *cbegin() const { return begin(); }
+  const_iterator cbegin() const { return begin(); }
 
-  const T *cend() const { return end(); }
+  const_iterator cend() const { return end(); }
 
-  const T &back() const {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  const T &back() const { return m_.Data().back(); }
 
-  T *data() { return begin(); }
+  T *data() { return m_.MutableData()->data(); }
 
-  const T *data() const { return begin(); }
+  const T *data() const { return m_.Data().data(); }
 
-  const T &front() const { return *begin(); }
+  const T &front() const { return m_.Data().front(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
   // NOTE: the iterator must support `end-begin`
   template <typename Iter>
   void assign(Iter begin, Iter end) {
-    InitByIter(end - begin, begin, end);
+    m_.MutableData()->assign(begin, end);
   }
 
   // push_back. If the previous capacity is not enough, the memory will
   // double.
-  void push_back(T elem) {
-    if (size_ + 1 > capacity()) {
-      reserve((size_ + 1) << 1);
-    }
-    *end() = elem;
-    ++size_;
-  }
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }
 
   // extend a vector by iterator.
   // NOTE: the iterator must support end-begin
   template <typename It>
   void Extend(It begin, It end) {
-    size_t pre_size = size_;
-    resize(pre_size + (end - begin));
-    T *ptr = this->begin() + pre_size;
-    for (; begin < end; ++begin, ++ptr) {
-      *ptr = *begin;
-    }
+    m_.MutableData()->Extend(begin, end);
   }
 
   // resize the vector
   void resize(size_t size) {
-    if (size + 1 <= capacity()) {
-      size_ = size;
-    } else {
-      MutableCPU();
-      Tensor cpu_tensor;
-      platform::Place cpu = platform::CPUPlace();
-      T *ptr = cpu_tensor.mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T *old_ptr =
-          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
-      if (old_ptr != nullptr) {
-        std::copy(old_ptr, old_ptr + size_, ptr);
-      }
-      size_ = size;
-      cpu_vec_.ShareDataWith(cpu_tensor);
+    if (m_.Data().size() != size) {
+      m_.MutableData()->resize(size);
     }
   }
 
   // get cuda ptr. immutable
   const T *CUDAData(platform::Place place) const {
-    PADDLE_ENFORCE(platform::is_gpu_place(place),
-                   "CUDA Data must on CUDA place");
-    ImmutableCUDA(place);
-    return cuda_vec_.data<T>();
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ holds CUDA data in a different place, detach manually.
+    m_.Detach();
+    return CUDAData(place);
   }
 
   // get cuda ptr. mutable
   T *CUDAMutableData(platform::Place place) {
-    const T *ptr = CUDAData(place);
-    flag_ = kDirty | kDataInCUDA;
-    return const_cast<T *>(ptr);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ holds CUDA data in a different place, detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
   }
 
   // clear
-  void clear() {
-    size_ = 0;
-    flag_ = kDirty | kDataInCPU;
-  }
+  void clear() { m_.MutableData()->clear(); }
 
-  size_t capacity() const {
-    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
-  }
+  size_t capacity() const { return m_.Data().capacity(); }
 
   // reserve data
-  void reserve(size_t size) {
-    size_t pre_size = size_;
-    resize(size);
-    resize(pre_size);
-  }
+  void reserve(size_t size) { m_.Data().reserve(size); }
 
   // the unify method to access CPU or CUDA data. immutable.
   const T *Data(platform::Place place) const {
@@ -248,12 +481,7 @@ class Vector {
   }
 
   // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const {
-    std::vector<T> result;
-    result.resize(size());
-    std::copy(begin(), end(), result.begin());
-    return result;
-  }
+  operator std::vector<T>() const { return m_.Data(); }
 
   bool operator==(const Vector &other) const {
     if (size() != other.size()) return false;
@@ -267,118 +495,11 @@ class Vector {
     return true;
   }
 
- private:
-  void InitEmpty() {
-    size_ = 0;
-    flag_ = kDataInCPU;
-  }
-
-  template <typename Iter>
-  void InitByIter(size_t size, Iter begin, Iter end) {
-    platform::Place cpu = platform::CPUPlace();
-    T *ptr = this->cpu_vec_.template mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-    for (size_t i = 0; i < size; ++i) {
-      *ptr++ = *begin++;
-    }
-    flag_ = kDataInCPU | kDirty;
-    size_ = size;
-  }
-
-  enum DataFlag {
-    kDataInCPU = 0x01,
-    kDataInCUDA = 0x02,
-    // kDirty means the data has been changed in one device.
-    kDirty = 0x10
-  };
-
-  void CopyToCPU() const {
-    // COPY GPU Data To CPU
-    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
-    WaitPlace(cuda_vec_.place());
-  }
-
-  void MutableCPU() {
-    if (IsInCUDA() && IsDirty()) {
-      CopyToCPU();
-    }
-    flag_ = kDirty | kDataInCPU;
-  }
-
-  void ImmutableCUDA(platform::Place place) const {
-    if (IsDirty()) {
-      if (IsInCPU()) {
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        UnsetFlag(kDirty);
-        SetFlag(kDataInCUDA);
-      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        cuda_vec_.ShareDataWith(tmp);
-        // Still dirty
-      } else {
-        // Dirty && DataInCUDA && Device is same
-        // Do nothing
-      }
-    } else {
-      if (!IsInCUDA()) {
-        // Even data is not dirty. However, data is not in CUDA. Copy data.
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        SetFlag(kDataInCUDA);
-      } else if (!(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        WaitPlace(cuda_vec_.place());
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        WaitPlace(place);
-        cuda_vec_.ShareDataWith(tmp);
-      } else {
-        // Not Dirty && DataInCUDA && Device is same
-        // Do nothing.
-      }
-    }
-  }
-
-  void ImmutableCPU() const {
-    if (IsDirty() &&
-        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
-      CopyToCPU();
-      UnsetFlag(kDirty);
-    }
-    SetFlag(kDataInCPU);
-  }
-
-  void UnsetFlag(int flag) const { flag_ &= ~flag; }
-  void SetFlag(int flag) const { flag_ |= flag; }
+  const void *Handle() const { return &m_.Data(); }
 
-  bool IsDirty() const { return flag_ & kDirty; }
-
-  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
-
-  bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-  static void WaitPlace(const platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      platform::DeviceContextPool::Instance()
-          .Get(boost::get<platform::CUDAPlace>(place))
-          ->Wait();
-    }
-  }
-
-  static T &EmptyDummy() {
-    static T dummy = T();
-    return dummy;
-  }
-
-  mutable int flag_;
-  mutable Tensor cpu_vec_;
-  mutable Tensor cuda_vec_;
-  size_t size_;
+ private:
+  // Vector is a COW object.
+  mutable details::COWPtr<VectorData> m_;
 };
 
 #else  // PADDLE_WITH_CUDA
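
mixed_vector.h is the main consumer of the new COWPtr: all state (the std::vector on the CPU side, the raw CUDABuffer on the GPU side, and the dirty flags) now lives in VectorData, and Vector<T> itself is a thin copy-on-write handle. A sketch of the flag transitions, assuming a CUDA build and device 0:

    paddle::framework::Vector<int64_t> rows(4, 1);  // starts as kDataInCPU
    rows.push_back(2);                              // CPU copy marked kDirty
    paddle::platform::CUDAPlace gpu(0);
    const int64_t *d_ptr = rows.CUDAData(gpu);   // dirty CPU -> copied to GPU
    int64_t *d_mut = rows.CUDAMutableData(gpu);  // flags kDirty | kDataInCUDA
    int64_t first = rows[0];  // GPU now dirty -> synced back before CPU read
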
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 06cf4a0f9f33af67343437baeb9623a35ddad183..19e5c2c73eac74dee030a4f7820531800f737e4e 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -38,31 +38,27 @@ struct OpInfo {
   OpAttrChecker* checker_{nullptr};
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
-  std::string op_type_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
   }
 
   const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
-                            op_type_);
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
     PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator %s Proto must be initialized in op info",
-                   op_type_);
+                   "Operator Proto must be initialized in op info");
     return *proto_;
   }
 
   const OpCreator& Creator() const {
-    PADDLE_ENFORCE_NOT_NULL(
-        creator_, "Operator %s Creator has not been registered", op_type_);
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
     return creator_;
   }
 
   const GradOpMakerFN& GradOpMaker() const {
     PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
-                            "Operator %s GradOpMaker has not been registered.",
-                            op_type_);
+                            "Operator GradOpMaker has not been registered.");
     return grad_op_maker_;
   }
 
@@ -77,9 +73,8 @@ class OpInfoMap {
     return map_.find(op_type) != map_.end();
   }
 
-  void Insert(const std::string& type, OpInfo info) {
+  void Insert(const std::string& type, const OpInfo& info) {
     PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
-    info.op_type_ = type;
     map_.insert({type, info});
   }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f5a54c0f48c98512dcb393d63a61f3c927e7ac1f..855870b41c9629aaae88f009ffa908a91b25a931 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,15 +19,13 @@ limitations under the License.
 */
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -35,80 +33,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes, const bool use_cuda,
-#ifdef PADDLE_WITH_CUDA
-    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
-#else
-    const BuildStrategy &strategy) {
-#endif
-  // Convert the program to graph.
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-
-  // Apply a graph viz pass to record a graph.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
-
-  // Apply op fusion.
-  if (strategy.fuse_elewise_add_act_ops_) {
-    auto fuse_elewise_add_act_pass =
-        ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
-    graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
-    // Apply a graph viz pass to record a graph.
-    if (!strategy.debug_graphviz_path_.empty()) {
-      auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-      const std::string graph_path = string::Sprintf(
-          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
-      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-      graph = viz_pass->Apply(std::move(graph));
-    }
-  }
-
-  // Convert graph to run on multi-devices.
-  auto multi_devices_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_pass");
-  multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
-                                                                      &places);
-  multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
-                                                     &loss_var_name);
-  multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
-      "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
-                                                              &local_scopes);
-  multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
-
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-  multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
-#endif
-  graph = multi_devices_pass->Apply(std::move(graph));
-
-  // Apply a graph print pass to record a graph with device info.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto multi_devices_print_pass =
-        ir::PassRegistry::Instance().Get("multi_devices_print_pass");
-    multi_devices_print_pass->SetNotOwned<const std::string>(
-        "debug_graphviz_path", &strategy.debug_graphviz_path_);
-    multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
-        "graph_printer", new details::GraphvizSSAGraphPrinter);
-    graph = multi_devices_print_pass->Apply(std::move(graph));
-  }
-
-  // Verify that the graph is correct for multi-device executor.
-  auto multi_devices_check_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_check_pass");
-  graph = multi_devices_check_pass->Apply(std::move(graph));
-  return graph;
-}
-
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
   // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
 #ifdef PADDLE_WITH_CUDA
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
       main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy,
-      member_->nccl_ctxs_.get());
+      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
 
   auto max_memory_size = GetEagerDeletionThreshold();
   if (max_memory_size >= 0) {
@@ -228,9 +151,9 @@ ParallelExecutor::ParallelExecutor(
     }
   }
 #else
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
-      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy);
+  std::unique_ptr<ir::Graph> graph =
+      build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
 
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
@@ -373,12 +296,6 @@ ParallelExecutor::~ParallelExecutor() {
 
 }  // namespace framework
 }  // namespace paddle
-
-USE_PASS(fuse_elewise_add_act_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(multi_devices_pass);
-USE_PASS(multi_devices_check_pass);
-USE_PASS(multi_devices_print_pass);
 #ifdef PADDLE_WITH_CUDA
 USE_PASS(reference_count_pass);
 #endif
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index c64906ff230df5f2b7cc9f5c6b29d68956ab8f33..fd386a5987f11ff64964e95eb7e9b83572dc790c 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,14 +14,14 @@ limitations under the License.
 */
 #pragma once
 
-#include <atomic>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
+#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
index 5ca864cfdf7176850dd31dd42ef3306061a742cf..928e1ad8b9168e61ddc5782066a4aa29a4296a94 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
     selected_rows_.reset(new SelectedRows(rows, height));
 
     Tensor* value = selected_rows_->mutable_value();
-    value->mutable_data<float>(
+    auto* data = value->mutable_data<float>(
         make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+    for (int64_t i = 0; i < value->numel(); ++i) {
+      data[i] = static_cast<float>(i);
+    }
   }
 
  protected:
@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
   ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+  auto* dst_data = dst_tensor.value().data<float>();
+  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
+    ASSERT_EQ(dst_data[i], static_cast<float>(i));
+  }
 }
 
 TEST(SelectedRows, SparseTable) {
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea..f90910ac0d0a897ef01d4ca2bd0bca575baf4c40 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
 TEST(Analyzer, analysis_with_tensorrt) {
   FLAGS_IA_enable_tensorrt_subgraph_engine = true;
   Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));
   argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
 
-void TestWord2vecPrediction(const std::string &model_path) {
+void TestWord2vecPrediction(const std::string& model_path) {
   NativeConfig config;
   config.model_dir = model_path;
   config.use_gpu = false;
@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
   // The outputs' buffers are in CPU memory.
   for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
     LOG(INFO) << "data: "
-              << static_cast<float *>(outputs.front().data.data())[i];
-    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
+              << static_cast<float*>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
                    result[i]);
   }
 }
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4..cb549f4b50cf56154a951d16b58b022dbad3e990 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
   }
 }
 
-void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+void CreateTrtEngineOp(Node *node, Argument *argument,
                        framework::proto::BlockDesc *block) {
+  PADDLE_ENFORCE(argument->main_dfg.get());
+  const DataFlowGraph &graph = *(argument->main_dfg);
   static int counter{0};
   PADDLE_ENFORCE(node->IsFunctionBlock());
   framework::OpDesc desc;
@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
   PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
 
   // Set attrs
+  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
+  SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
+  SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
   SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
   SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
     *block_desc.Proto()->mutable_vars() =
         argument_->origin_program_desc->blocks(0).vars();
     PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-    CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
+    CreateTrtEngineOp(node, argument_, block_desc.Proto());
     auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
     auto *op = main_block->add_ops();
     PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index b879067d2f2f6294c50e0adb21f9399a7c36698a..526bbbadfe90c3064d7c620cc22e30f7fef99088 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
 void SubGraphFuse::ReplaceNodesWithSubGraphs() {
   auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
   for (auto &subgraph : subgraphs) {
+    if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
+      continue;
     std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
     // replace this sub-graph with the first node. Two steps: 1. Create a Block
     // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
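
The inference changes above replace the old FLAGS_tensorrt_* globals with per-run attributes carried on Argument, and SubGraphFuse consults "minimum_subgraph_size" so that subgraphs at or below the threshold stay in Fluid. The pattern, as the testers use it (values here are illustrative):

    paddle::inference::analysis::Argument argument;
    argument.Set<int>("minimum_subgraph_size", new int(3));
    argument.Set<int>("max_batch_size", new int(8));
    argument.Set<int>("workspace_size", new int(1 << 20));
    argument.Set<std::string>("precision_mode", new std::string("FP32"));
    // SubGraphFuse(&dfg, teller, &argument)() then leaves any subgraph with
    // size() <= 3 in Fluid instead of fusing it into a TensorRT engine op.
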
*/ #include +#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/node.h" @@ -63,8 +64,11 @@ class SubGraphFuse { public: using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; - SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) - : graph_(graph), node_inside_subgraph_teller_(teller) {} + SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller, + Argument *argument) + : graph_(graph), + node_inside_subgraph_teller_(teller), + argument_(argument) {} // The main method, which runs all the fusing logic. void operator()(); @@ -76,6 +80,7 @@ class SubGraphFuse { private: DataFlowGraph *graph_; NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument *argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 531a170512f727d891aa6644ee08a60c25f16876..e1dc89fab5fb76d456b07c316ab1cabe6de23b26 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) { TEST(SubGraphSplitter, Fuse) { auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); + Argument argument; + argument.Set("minimum_subgraph_size", new int(3)); size_t count0 = dfg.nodes.size(); - SubGraphFuse fuse(&dfg, teller); + SubGraphFuse fuse(&dfg, teller, &argument); fuse(); int count1 = 0; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index faf876de6d65d20cf7a084cd97392cfc8d791a42..cc1746ecb34c983d219693bcec17c8789c38fa9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( : node_inside_subgraph_teller_(teller) {} void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_)(); + SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); VLOG(4) << "debug info " << graph->HumanReadableInfo(false /*show_values*/, true /*show_functions*/); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 219e3f5470f627e81005aabf94f9c72c33fd2eed..3545da9109d79964f36c3d7e738620cc2e0f9a6c 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); - bool Initialize(Argument* argument) override { return true; } + bool Initialize(Argument* argument) override { + argument_ = argument; + return true; + } // This class gets a sub-graph as input and determines whether to transform // this sub-graph into TensorRT.
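// With the threshold plumbed through the Argument, the fuse pass skips any
// candidate subgraph whose node count is at or below minimum_subgraph_size,
// leaving small clusters to the regular Fluid executor where the overhead of
// launching a TensorRT engine would not pay off. A self-contained sketch of
// that filtering step (simplified types, not the real pass):

#include <iostream>
#include <string>
#include <vector>

using SubGraph = std::vector<std::string>;  // stand-in for a set of fused nodes

// Keep only the subgraphs large enough to be worth handing to TensorRT.
std::vector<SubGraph> FilterFusable(const std::vector<SubGraph> &candidates,
                                    size_t minimum_subgraph_size) {
  std::vector<SubGraph> fusable;
  for (const auto &subgraph : candidates) {
    if (subgraph.size() <= minimum_subgraph_size) continue;  // too small to fuse
    fusable.push_back(subgraph);
  }
  return fusable;
}

int main() {
  std::vector<SubGraph> candidates = {
      {"conv", "relu", "conv", "relu"}, {"scale"}, {"mul", "add", "relu"}};
  // With the default threshold of 3, only the four-node subgraph survives.
  std::cout << FilterFusable(candidates, 3).size() << "\n";  // prints 1
  return 0;
}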
@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument* argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 67a5af83d89b771536ea11be51b35244ff5c09d6..9748e24b06295a4e7c2995429e6588cd0f225fe6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) { }; Argument argument(FLAGS_inference_model_dir); + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 6c7e63971b2d93f58e219dbd93637c8d389deb7c..5ee6a5a93168f58770067f76ca7f6bb6f67b2965 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; - FLAGS_tensorrt_max_batch_size = config_.max_batch_size; - FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + + argument.Set("minimum_subgraph_size", + new int(config_.minimum_subgraph_size)); + argument.Set("max_batch_size", new int(config_.max_batch_size)); + argument.Set("workspace_size", new int(config_.workspace_size)); + argument.Set("precision_mode", + new std::string(config_.precision_mode)); + if (!config_.model_dir.empty()) { argument.fluid_model_dir.reset(new std::string(config_.model_dir)); } else { diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2b4e5ed73704041e18bdbce32338405f3601e082..984358b2bd90daf768cea0a6e36b5805d81050d6 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig { // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int workspace_size{1 << 30}; + // We transform the ops that can be converted into TRT layers in the model + // and aggregate these ops into subgraphs for TRT execution. + // This variable controls the minimum number of nodes in a subgraph; the + // default value is 3. + int minimum_subgraph_size = 3; + // Reserved configuration. + // Only "FP32" is supported now; "FP16" and "INT8" will be supported later. + std::string precision_mode = "FP32"; }; // NOTE WIP, not stable yet. @@ -204,10 +212,11 @@ struct AnalysisConfig : public NativeConfig { kExclude // Specify the disabled passes in `ir_passes`. }; + // Determine whether to perform graph optimization.
bool enable_ir_optim = true; + // Manually determine the IR passes to run. IrPassMode ir_mode{IrPassMode::kExclude}; - // attention lstm fuse works only on some specific models, disable as default. - std::vector ir_passes{"attention_lstm_fuse_pass"}; + std::vector ir_passes; // NOTE this is just for internal development, please do not use it. bool _use_mkldnn{false}; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d7ab2ac980af2cf3bd9d95bfdbfa1887ef9a64d7..70f9e397c96cf3fe92779778950f3df71b5a67c9 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() endif() + +if(WITH_GPU AND TENSORRT_FOUND) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + endif() + cc_test(test_trt_models SRCS trt_models_tester.cc + ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models + DEPS paddle_inference_tensorrt_subgraph_engine) +endif() diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf320a0cbc2fff5f973c48768281e26d0fde232b --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
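// Taken together, the new MixedRTConfig fields let callers bound the engine's
// batch size and scratch memory and tune how aggressively subgraphs are
// offloaded. A configuration sketch built only from fields that appear in
// this patch (the exact CreatePaddlePredictor template arguments are elided
// in this extract, so only the config construction is shown):

#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Assemble a TensorRT mixed-engine config; every field set below is declared
// in the paddle_inference_api.h hunk above.
paddle::contrib::MixedRTConfig MakeTrtConfig(const std::string &model_dir) {
  paddle::contrib::MixedRTConfig config;
  config.model_dir = model_dir;
  config.use_gpu = true;
  config.device = 0;
  config.fraction_of_gpu_memory = 0.2;
  config.max_batch_size = 3;         // hard upper bound on runtime batch size
  config.workspace_size = 1 << 20;   // TensorRT scratch space, in bytes
  config.minimum_subgraph_size = 3;  // skip subgraphs of 3 nodes or fewer
  config.precision_mode = "FP32";    // "FP16"/"INT8" are reserved for later
  return config;
}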
+ +#include +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +using paddle::contrib::MixedRTConfig; + +DEFINE_string(dirname, "", "Directory of the inference model."); + +NativeConfig GetConfigNative() { + NativeConfig config; + config.model_dir = FLAGS_dirname; + // LOG(INFO) << "dirname " << config.model_dir; + config.fraction_of_gpu_memory = 0.45; + config.use_gpu = true; + config.device = 0; + return config; +} + +MixedRTConfig GetConfigTRT() { + MixedRTConfig config; + config.model_dir = FLAGS_dirname; + config.use_gpu = true; + config.fraction_of_gpu_memory = 0.2; + config.device = 0; + config.max_batch_size = 3; + return config; +} + +void CompareTensorRTWithFluid(int batch_size, const std::string &model_dirname) { + NativeConfig config0 = GetConfigNative(); + config0.model_dir = model_dirname; + + MixedRTConfig config1 = GetConfigTRT(); + config1.model_dir = model_dirname; + config1.max_batch_size = batch_size; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = + CreatePaddlePredictor(config1); + // Prepare inputs + int height = 224; + int width = 224; + float *data = new float[batch_size * 3 * height * width]; + memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); + data[0] = 1.0f; + + // Prepare inputs + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({batch_size, 3, height, width}); + tensor.data = PaddleBuf(static_cast(data), + sizeof(float) * (batch_size * 3 * height * width)); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + // Prepare outputs + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); + + CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size)); + + // Get output. + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); + } + delete[] data;  // the PaddleBuf above does not take ownership of this buffer +} + +TEST(trt_models_test, main) { + std::vector infer_models = {"mobilenet", "resnet50", + "resnext50"}; + for (auto &model_dir : infer_models) { + CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir); + } +} +} // namespace paddle diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index dfaa7456f917c1308984b361afed752f96ea6f59..0784920064a879963cd9725cd9acf4cec7b874ce 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel { "Out and Label should have same height."); int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; + int slide_steps = ctx->Attrs().Get("slide_steps"); + + PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must be larger than 1"); + PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be nonnegative"); ctx->SetOutputDim("AUC", {1}); - ctx->SetOutputDim("BatchAUC", {1}); - ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); + + slide_steps = slide_steps == 0 ?
1 : slide_steps; + ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets}); + ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets}); } protected: @@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "A 2D int tensor indicating the label of the training data. " "shape: [batch_size, 1]"); + // TODO(typhoonzero): support weight input AddInput("StatPos", "Statistic value when label = 1"); AddInput("StatNeg", "Statistic value when label = 0"); @@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("AUC", "A scalar representing the " "current area-under-the-curve."); - AddOutput("BatchAUC", "The AUC for current batch"); + AddOutput("StatPosOut", "Statistic value when label = 1"); AddOutput("StatNegOut", "Statistic value when label = 0"); AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") .SetDefault("ROC"); - AddAttr("num_thresholds", - "The number of thresholds to use when discretizing the" - " roc curve.") + AddAttr( + "num_thresholds", + "The number of thresholds to use when discretizing the roc curve.") .SetDefault((2 << 12) - 1); - + AddAttr("slide_steps", "The number of recent batches over which the batch AUC is computed; 0 accumulates statistics over all batches.") .SetDefault(1); AddComment(R"DOC( Area Under The Curve (AUC) Operator. diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index fb0517d70635e090f8c5b59ff9d8420fc34c747b..fb370842d1942c3b3eebecb1fe5e8ffb845cb34b 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel { std::string curve = ctx.Attr("curve"); int num_thresholds = ctx.Attr("num_thresholds"); + // bucket indices run from 0 to num_thresholds, so there are num_thresholds + 1 buckets int num_pred_buckets = num_thresholds + 1; + int slide_steps = ctx.Attr("slide_steps"); // Only use output var for now, make sure it's persistable and // not cleaned up for each batch. @@ -40,16 +42,19 @@ auto *stat_pos = ctx.Output("StatPosOut"); auto *stat_neg = ctx.Output("StatNegOut"); - auto *stat_pos_data = stat_pos->mutable_data(ctx.GetPlace()); - auto *stat_neg_data = stat_neg->mutable_data(ctx.GetPlace()); - calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds, - auc); + auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); + auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *batch_auc = ctx.Output("BatchAUC"); - std::vector stat_pos_batch(num_pred_buckets, 0); - std::vector stat_neg_batch(num_pred_buckets, 0); - calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), - num_thresholds, batch_auc); + std::vector stat_pos_data(num_pred_buckets, 0); + std::vector stat_neg_data(num_pred_buckets, 0); + + auto stat_pos_calc = stat_pos_data.data(); + auto stat_neg_calc = stat_neg_data.data(); + + statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps, + origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc); + + calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc); } private: @@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel { return (X1 > X2 ?
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; } - inline static void calcAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, + inline static void statAuc(const framework::Tensor *label, const framework::Tensor *predict, - int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, - framework::Tensor *auc_tensor) { + const int num_pred_buckets, + const int num_thresholds, const int slide_steps, + int64_t *origin_stat_pos, int64_t *origin_stat_neg, + int64_t **stat_pos, int64_t **stat_neg) { size_t batch_size = predict->dims()[0]; size_t inference_width = predict->dims()[1]; const T *inference_data = predict->data(); const auto *label_data = label->data(); - auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); - for (size_t i = 0; i < batch_size; i++) { uint32_t binIdx = static_cast( inference_data[i * inference_width + 1] * num_thresholds); if (label_data[i]) { - stat_pos[binIdx] += 1.0; + (*stat_pos)[binIdx] += 1; } else { - stat_neg[binIdx] += 1.0; + (*stat_neg)[binIdx] += 1; } } + int bucket_length = num_pred_buckets * sizeof(int64_t); + + // slide_steps == 0 means the statistics accumulate over all batches seen so far. + if (slide_steps == 0) { + for (int slide = 0; slide < num_pred_buckets; ++slide) { + origin_stat_pos[slide] += (*stat_pos)[slide]; + origin_stat_neg[slide] += (*stat_neg)[slide]; + } + + *stat_pos = origin_stat_pos; + *stat_neg = origin_stat_neg; + + } else { + for (int slide = 1; slide < slide_steps; ++slide) { + int dst_idx = (slide - 1) * num_pred_buckets; + int src_idx = slide * num_pred_buckets; + std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_idx, + bucket_length); + std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_idx, + bucket_length); + } + + std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets, + *stat_pos, bucket_length); + std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets, + *stat_neg, bucket_length); + + std::memset(*stat_pos, 0, bucket_length); + std::memset(*stat_neg, 0, bucket_length); + + for (int slide = 0; slide < num_pred_buckets; ++slide) { + int stat_pos_steps = 0; + int stat_neg_steps = 0; + for (int step = 0; step < slide_steps; ++step) { + stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets]; + stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets]; + } + (*stat_pos)[slide] += stat_pos_steps; + (*stat_neg)[slide] += stat_neg_steps; + } + } + } + + inline static void calcAuc(const framework::ExecutionContext &ctx, + int64_t *stat_pos, int64_t *stat_neg, + int num_thresholds, + framework::Tensor *auc_tensor) { + auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); + *auc = 0.0f; double totPos = 0.0; @@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel { totPos += stat_pos[idx]; totNeg += stat_neg[idx]; *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 5a058ddbc59c6135bacf7c2dc4b5c8b687f9b2b1..aa8ed502fc94bd0970dfe5dbf00ef090e799ad30 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) -detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + +if(WITH_GPU) + 
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) +else() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc) +endif() + detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) #Export local libraries to parent set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index c33aa255362bc5234f2813fb93e70c943b03c33f..818d58ea9ee327fd99182ad2f8cbeed07e6aaea2 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Anchors")->type()), - platform::CPUPlace()); + ctx.device_context()); } }; @@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0], 1}); + keep->Resize({boxes->dims()[0]}); min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); @@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("post_nms_topN", "post_nms_topN"); AddAttr("nms_thresh", "nms_thres"); AddAttr("min_size", "min size"); - AddAttr("eta", "eta"); + AddAttr("eta", "The parameter for adaptive NMS."); AddComment(R"DOC( Generate Proposals OP diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6146ff509d768c0317a5c65ed22af1a3075977a2 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -0,0 +1,449 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +template +__global__ void RangeInitKernel(const T start, const T delta, const int size, + T *out) { + CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; } +} + +template +void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, + Tensor *value_out, Tensor *index_out) { + int num = value.numel(); + Tensor index_in_t; + int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); + int block = 512; + auto stream = ctx.stream(); + RangeInitKernel<<>>(0, 1, num, idx_in); + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); + + const T *keys_in = value.data(); + T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, + num); + + // Allocate temporary storage + auto place = boost::get(ctx.GetPlace()); + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, + num); + + memory::Free(place, d_temp_storage); +} + +template +__device__ __forceinline__ T Min(T x, T y) { + return x < y ? x : y; +} + +template +__device__ __forceinline__ T Max(T x, T y) { + return x > y ? 
x : y; +} + +template +__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, + const T *var, const int *index, + const T *im_info, const int num, + T *proposals) { + T kBBoxClipDefault = log(1000.0 / 16.0); + CUDA_1D_KERNEL_LOOP(i, num) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T w = axmax - axmin + 1.0; + T h = aymax - aymin + 1.0; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], kBBoxClipDefault)) * w; + d_h = exp(Min(dymax * var[k + 3], kBBoxClipDefault)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, kBBoxClipDefault)) * w; + d_h = exp(Min(dymax, kBBoxClipDefault)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - 1.; + T oymax = d_cy + d_h * 0.5 - 1.; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + } +} + +template +__global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, int *keep_num, + int *keep) { + T im_h = im_info[0]; + T im_w = im_info[1]; + T im_scale = im_info[2]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_1D_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + + T w = xmax - xmin + 1.0; + T h = ymax - ymin + 1.0; + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + T w_s = (xmax - xmin) / im_scale + 1.; + T h_s = (ymax - ymin) / im_scale + 1.; + + if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? 
num - i : BlockSize; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + keep_num[0] = cnt; + } +} + +__device__ inline float IoU(const float *a, const float *b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float inter_s = width * height; + float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return inter_s / (s_a + s_b - inter_s); +} + +__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); + const int col_size = + min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); + + __shared__ float block_boxes[kThreadsPerBlock * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, + const Tensor &sorted_indices, const T nms_threshold, + Tensor *keep_out) { + int boxes_num = proposals.dims()[0]; + PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]); + + const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); + dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), + DIVUP(boxes_num, kThreadsPerBlock)); + dim3 threads(kThreadsPerBlock); + + const T *boxes = proposals.data(); + auto place = boost::get(ctx.GetPlace()); + int size_bytes = boxes_num * col_blocks * sizeof(uint64_t); + uint64_t *d_mask = + reinterpret_cast(memory::Alloc(place, size_bytes)); + NMSKernel<<>>(boxes_num, nms_threshold, boxes, d_mask); + uint64_t *h_mask = reinterpret_cast( + memory::Alloc(platform::CPUPlace(), size_bytes)); + memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / kThreadsPerBlock; + int inblock = i % kThreadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(i); + uint64_t *p = &h_mask[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); + memory::Copy(place, keep, platform::CPUPlace(), 
keep_vec.data(), + sizeof(int) * num_to_keep, 0); + memory::Free(place, d_mask); + memory::Free(platform::CPUPlace(), h_mask); +} + +template +std::pair ProposalForOneImage( + const platform::CUDADeviceContext &ctx, const Tensor &im_info, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas, // [M, 4] + const Tensor &scores, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + // 1. pre nms + Tensor scores_sort, index_sort; + SortDescending(ctx, scores, &scores_sort, &index_sort); + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sort.Resize({pre_nms_num, 1}); + index_sort.Resize({pre_nms_num, 1}); + + // 2. box decode and clipping + Tensor proposals; + proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); + int block = 512; + auto stream = ctx.stream(); + BoxDecodeAndClipKernel<<>>( + anchors.data(), bbox_deltas.data(), variances.data(), + index_sort.data(), im_info.data(), pre_nms_num, + proposals.data()); + + // 3. filter + Tensor keep_index, keep_num_t; + keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); + keep_num_t.mutable_data({1}, ctx.GetPlace()); + min_size = std::max(min_size, 1.0f); + FilterBBoxes<<<1, 512, 0, stream>>>( + proposals.data(), im_info.data(), min_size, pre_nms_num, + keep_num_t.data(), keep_index.data()); + int keep_num; + const auto gpu_place = boost::get(ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, + keep_num_t.data(), sizeof(int), 0); + keep_index.Resize({keep_num}); + + Tensor scores_filter, proposals_filter; + proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); + GPUGather(ctx, proposals, keep_index, &proposals_filter); + GPUGather(ctx, scores_sort, keep_index, &scores_filter); + + if (nms_thresh <= 0) { + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. 
nms + Tensor keep_nms; + NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class CUDAGenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto *anchors = context.Input("Anchors"); + auto *variances = context.Input("Variances"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS."); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + Tensor *anchor = const_cast(anchors); + anchor->Resize({anchors->numel() / 4, 4}); + Tensor *var = const_cast(variances); + var->Resize({var->numel() / 4, 4}); + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = boost::get(dev_ctx.GetPlace()); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = + ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + + Tensor proposals = box_score_pair.first; + Tensor scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel(), 0); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel(), 0); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + } + framework::LoD lod; + 
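// offset accumulates the cumulative proposal count after each image: three
// images contributing {2, 0, 3} proposals yield offset {0, 2, 2, 5}. Recorded
// as the level-0 LoD below, those boundaries let consumers slice the flattened
// [num_proposals, 4] RpnRois tensor back into per-image rows.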
lod.emplace_back(offset); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(generate_proposals, + ops::CUDAGenerateProposalsKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be..dd5d138a1e979826d59c4731920379b030e3b492 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto ap_type = GetAPType(ctx.Attr("ap_type")); int class_num = ctx.Attr("class_num"); - auto label_lod = in_label->lod(); - auto detect_lod = in_detect->lod(); + auto& label_lod = in_label->lod(); + auto& detect_lod = in_detect->lod(); PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), @@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto labels = framework::EigenTensor::From(input_label); auto detect = framework::EigenTensor::From(input_detect); - auto label_lod = input_label.lod(); - auto detect_lod = input_detect.lod(); + auto& label_lod = input_label.lod(); + auto& detect_lod = input_detect.lod(); int batch_size = label_lod[0].size() - 1; - auto label_index = label_lod[0]; + auto& label_index = label_lod[0]; for (int n = 0; n < batch_size; ++n) { std::map> boxes; @@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel { output_true_pos->set_lod(true_pos_lod); output_false_pos->set_lod(false_pos_lod); - return; } void GetInputPos(const framework::Tensor& input_pos_count, @@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto pos_data_lod = pos_tensor.lod()[0]; + auto& pos_data_lod = pos_tensor.lod()[0]; for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; @@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>* false_pos) const { int batch_size = gt_boxes.size(); for (int n = 0; n < batch_size; ++n) { - auto image_gt_boxes = gt_boxes[n]; - for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { + auto& image_gt_boxes = gt_boxes[n]; + for (auto& image_gt_box : image_gt_boxes) { size_t count = 0; - auto labeled_bboxes = it->second; + auto& labeled_bboxes = image_gt_box.second; if (evaluate_difficult) { count = labeled_bboxes.size(); } else { - for (size_t i = 0; i < labeled_bboxes.size(); ++i) - if (!(labeled_bboxes[i].is_difficult)) ++count; + for (auto& box : labeled_bboxes) { + if (!box.is_difficult) { + ++count; + } + } } if (count == 0) { continue; } - int label = it->first; + int label = image_gt_box.first; if (label_pos_count->find(label) == label_pos_count->end()) { (*label_pos_count)[label] = count; } else { diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc index 9a297d03cfb041e584159a5fc5ba214f8ac404b4..3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7 100644 --- a/paddle/fluid/operators/extract_rows_op.cc +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -50,7 
+50,7 @@ class ExtractRowsOp : public framework::OperatorBase { auto &in = scope.FindVar(Input("X"))->Get(); auto out = scope.FindVar(Output("Out"))->GetMutable(); - auto in_rows = in.rows(); + auto &in_rows = in.rows(); auto out_dim = framework::make_ddim( std::vector{static_cast(in_rows.size()), 1}); auto dst_ptr = out->mutable_data(out_dim, in.place()); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 8ca79d20ec4f6412b00dbf3990068f81b65e2efd..23e8edd18d037a7f9127482951f25be3abf1b62f 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims[0], 1, "The first dimension of Input(Bias) should be 1."); - PADDLE_ENFORCE_EQ( - b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, - "The second dimension of Input(Bias) should be " - "7 * %d if enable peepholes connection or" - "4 * %d if disable peepholes", - frame_size, frame_size); + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if peephole connections are enabled", + frame_size); + ctx->SetOutputDim("CheckedCell", {2, frame_size}); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if peephole connections are disabled", + frame_size); + } framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); @@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() { AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.") + .AsIntermediate(); AddAttr("use_peepholes", "(bool, default: True) " "whether to enable diagonal/peephole connections.") .SetDefault(true); @@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D3 = D * 3; \ const int D4 = wh_dims[1]; -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - Tensor checked_cell; \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - checked_cell_data = checked_cell.mutable_data({2, D}, place); \ +#define INIT_BASE_INPUT_DATAS \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ } /// Compute LSTM diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 74823dab09cac358f647c074ac2f2ee2fed17e55..abd5dce8f7e7146a1671a387328c177e5e6e0a85 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public
framework::OpKernel { auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. - memory::Copy(platform::CPUPlace(), - new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); - + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index b27880c232a51d32777569cf9ac67656ce02f232..ba8eccf82042b679f69a32f9d053f05ac8fb9a99 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -60,11 +60,9 @@ struct SelectedRowsAdd { auto out_place = context.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(out_place)); - memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); memory::Copy(boost::get(out_place), @@ -148,7 +146,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 326c58ee1c09d6f745e6c8abfb92030d11d8c1c6..a0d640b2020958af53a4405ae886eadb2a1e117e 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -45,12 +45,10 @@ class ReadInferVarType : public framework::VarTypeInference { framework::VarDesc* reader = block->FindVarRecursive(reader_name); auto dtypes = reader->GetDataTypes(); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - auto lod_levels = reader->GetLoDLevels(); for (size_t i = 0; i < dtypes.size(); ++i) { framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetDataType(dtypes[i]); - out.SetLoDLevel(lod_levels[i]); } } }; diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 724463c95c4a29fb5c00fe791b389d3908771640..a4f41a170426a4650fd3bf8f7fec4758ff34e1b9 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { SamplingId Operator. A layer for sampling id from multinomial distribution from the input. Sampling one id for one sample.)DOC"); - AddAttr("min", "Minimum value of random. [default 0.0].") + AddAttr("min", "Minimum value of random. (float, default 0.0).") .SetDefault(0.0f); - AddAttr("max", "Maximun value of random. [default 1.0].") + AddAttr("max", "Maximum value of random. (float, default 1.0).") .SetDefault(1.0f); - AddAttr("seed", - "Random seed used for the random number engine. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time. [default 0].") + AddAttr( + "seed", + "Random seed used for the random number engine. " + "0 means use a seed generated by the system."
+ "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time. (int, default 0).") .SetDefault(0); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index bf4df4f600c14050b636b7ee6d7b6973b57adb94..981969d2aaa684731a615ec64ca7f7718b35cf09 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference { auto out_var_name = op_desc.Output("Out").front(); auto *out_var = block->FindVarRecursive(out_var_name); - out_var->SetType(in_var.GetType()); - out_var->SetDataType(in_var.GetDataType()); + if (in_var_name != out_var_name) { + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } } }; diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 9527e7ba300e10a6af1a0dd4b312c0323115256e..243609075713305a90dc162991166ba24d54e835 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -88,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in_height, out_dims[0]); auto& in_value = grad->value(); - framework::Vector in_rows(grad->rows()); + auto& in_rows = grad->rows(); int64_t in_row_numel = in_value.numel() / in_rows.size(); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index e008e130e34f60a78bf44e211c42c4b7786d1721..29d2fb989754f5621222768a279a1c898ea1c355 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -52,26 +52,16 @@ class ShrinkRNNMemoryOp : public ArrayOp { size_t height = dst_num_rows; // do shrink for the top level LoD - if (x_tensor.lod().size() > 0 && x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { - if (x_tensor.lod().size() > 1) { // MultiLevel LoD - auto lod_offset = framework::GetSubLoDAndAbsoluteOffset( - x_tensor.lod(), 0, dst_num_rows, 0); - height = lod_offset.second.second; - auto out_lod = out_tensor.mutable_lod(); - framework::AppendLoD(out_lod, lod_offset.first); - } else { - // Shrink LoD - auto lod_item = x_tensor.lod()[0]; - lod_item.resize(dst_num_rows + 1); - out_tensor.set_lod({lod_item}); - const auto &const_lod_item = lod_item; - height = const_lod_item.back(); - } + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); } - if (height != 0) { + if (dst_num_rows != 0) { out_tensor.mutable_data(place, x_tensor.type()); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, @@ -144,11 +134,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; - if (height != 0) { - auto slice = dx_tensor.Slice(0, static_cast(height)); - framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, - &slice); - } + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h 
index 6dffe527c1072ee97fcde1725bfc1a47ed1ad74a..34403c7a7aa717cca470be2931009e219e00e3ae 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto in_vars = context.MultiInputVar("X"); - int N = in_vars.size(); + size_t in_num = in_vars.size(); auto out_var = context.OutputVar("Out"); bool in_place = out_var == in_vars[0]; @@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); // If in_place, just skip the first tensor - for (int i = in_place ? 1 : 0; i < N; i++) { + for (size_t i = in_place ? 1 : 0; i < in_num; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); if (in_t.numel() == 0) { @@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel { // Runtime InferShape size_t first_dim = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } std::vector in_dim; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() > 0) { in_dim = framework::vectorize(sel_row.value().dims()); @@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel { } if (in_dim.empty()) { VLOG(3) << "WARNING: all the inputs are empty"; - in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim = + framework::vectorize(get_selected_row(in_num - 1).value().dims()); } else { in_dim[0] = static_cast(first_dim); } out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); - // if all the input sparse vars are empty, no need to // merge these vars. 
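// (At this point first_dim is the total row count gathered from every input
// SelectedRows, and out_value has already been resized to that height; a zero
// first_dim therefore means every sparse input was empty and the add-to loop
// below can be skipped.)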
if (first_dim == 0UL) { @@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel { math::SelectedRowsAddTo functor; int64_t offset = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() == 0) { continue; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 1048d3017140c9e31426a1580b2862667116a024..41a5786fe8c3295390144732221280e152d0a15a 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -22,8 +22,6 @@ namespace paddle { DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); -DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); -DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); namespace operators { @@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); + AddAttr("max_batch_size", "the maximum batch size."); + AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 69173ff5178d32634f9ab291b7d709a3f91cb368..3c78c29c1a30d74947be84cd2b52ad308e732a2d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -28,8 +28,6 @@ namespace paddle { DECLARE_int32(tensorrt_engine_batch_size); -DECLARE_int32(tensorrt_max_batch_size); -DECLARE_int32(tensorrt_workspace_size); namespace operators { @@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto engine_name = context.Attr("engine_uniq_key"); + int max_batch_size = context.Attr("max_batch_size"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); } auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, - FLAGS_tensorrt_max_batch_size); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); std::vector output_maps = context.Attr>("output_name_mapping"); @@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Get the ProgramDesc and pass to convert. 
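// The engine geometry now travels with the op itself: Prepare() reads the
// "max_batch_size" and "workspace_size" attributes that the analysis pass
// attached to the op, instead of the old process-wide
// FLAGS_tensorrt_max_batch_size / FLAGS_tensorrt_workspace_size, so two engine
// ops in one program can be built with different budgets.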
framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = FLAGS_tensorrt_max_batch_size; - auto max_workspace = FLAGS_tensorrt_workspace_size; + int max_batch_size = context.Attr("max_batch_size"); + int workspace_size = context.Attr("workspace_size"); + auto params = context.Attr>("parameters"); std::unordered_set parameters; for (const auto& param : params) { @@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) replace this with a different stream auto* engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, + max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, context.Attr("engine_uniq_key"), boost::get(context.GetPlace()).device); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 27c1d29762b3de5e57f877b271aae52e71eb7cf9..e21101e8d12f210af08284dbcebe5c14c1af6dd3 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { - FLAGS_tensorrt_engine_batch_size = 2; - FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { - FLAGS_tensorrt_engine_batch_size = batch_size; - FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; @@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", batch_size); - SetAttr(engine_op_desc.Proto(), "max_workspace", 2 << 10); + SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr>( engine_op_desc.Proto(), "parameters", std::vector({"y0", "y1", "y2", "y3"})); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 1b283fc9725fb8d01da913312844d0faea29daf6..dfc079e986e93c7f02f17b299e5d6293edbedd05 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -201,7 +201,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); - grid_max_dims_ = GpuMaxGridDim(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -240,10 +239,6 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const { return multi_process * 
max_threads_per_mp; } -std::tuple CUDADeviceContext::GetMaxGridDims() const { - return grid_max_dims_; -} - Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index da32b0dad4b8cfe75bf82f59ec58db8136b899f2..79539195157d74d4d757edee5e008cbb76c93ee2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -13,7 +13,6 @@ limitations under the License. */ #include #include // NOLINT #include -#include #include #include @@ -92,8 +91,6 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return the max physical thread count in the device context */ int GetMaxPhysicalThreadCount() const; - std::tuple GetMaxGridDims() const; - /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; @@ -138,8 +135,6 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cublasHandle_t cublas_handle_; - std::tuple grid_max_dims_; - int compute_capability; int multi_process; int max_threads_per_mp; diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 2806d726d2b1ac6b717a9041af19e7ee62be6883..c153e80fe42aecb33d3aa97874d2881bce9029be 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -48,54 +48,35 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) { } template -__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { +__global__ static void ForRangeElemwiseOp(Function func, int limit) { size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (idx < limit) { func(idx); } } -template -__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit, - int grid_dim) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - while (idx < limit) { - func(idx); - idx += grid_dim; - } -} - template <> struct ForRange { ForRange(const CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(limit) {} + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} template inline void operator()(Function func) const { constexpr int num_threads = 1024; int block_size = limit_ <= num_threads ? 
limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims()); - - if (grid_size < max_grid_dim) { - int grid_size_int = static_cast(grid_size); - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); } else { - ForRangeElemwiseOpGridLarge<<>>(func, limit_, - max_grid_dim); + ForRangeElemwiseOp<<>>( + func, limit_); } } const CUDADeviceContext& dev_ctx_; - size_t limit_; + int limit_; }; #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index b88523728407803a1ea9d343dc2d33c6a38d5de9..126636d879213b1c8f242db8fbdf6a358a1d2da9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -152,22 +152,5 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); } - -std::tuple GpuMaxGridDim(int id) { - std::tuple result; - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id), - "cudaDeviceGetAttribute failed in " - "cudaDevAttrMaxBlockDim"); - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id), - "cudaDeviceGetAttribute failed in " - "cudaDevAttrMaxBlockDim"); - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id), - "cudaDeviceGetAttribute failed in " - "cudaDevAttrMaxBlockDim"); - return result; -} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index b748c6e8a519d27acd211f815a210c7a74ff32c8..f4640d3eaa2165c35e8e14690d83e9e7e7168c0b 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include -#include namespace paddle { namespace platform { @@ -73,8 +72,6 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, //! Set memory dst with value count size asynchronously void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); -std::tuple GpuMaxGridDim(int id); - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b5bd07d401f9ebfe441bc0f84f9bad317f0e8da9..e7f634c4a622b48e97040987836406cf73cb23b6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8b62502e3f920a3bf7d80f9e7edc2f3647a0e5b1..ef2f1f2a20a2eddee8ac077ee4bbf4dfd777448d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("set_str", [](ir::Pass &self, const std::string &name, + const std::string &attr) { + self.Set(name, new std::string(attr)); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); + // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy"); @@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle. }, [](BuildStrategy &self, bool b) { self.fuse_elewise_add_act_ops_ = b; - }); + }) + .def("_create_passes_from_strategy", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(); + }); pe.def(py::init &, const std::unordered_set &, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e8d2e8e6861b95c94077a72f8e4f7d11b0ab11be..86f2b5d426175947871a338fa4562eb0144ea138 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -70,8 +70,8 @@ function cmake_gen() { PYTHON_FLAGS="" SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then + echo "Using python abi: $1" if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then - echo "using python abi: $1" if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 @@ -82,7 +82,18 @@ function cmake_gen() { else exit 1 fi - # TODO: qiyang add python3 part here + elif [ "$1" == "cp35-cp35m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -629,10 +640,10 @@ EOF function gen_capi_package() { if [[ ${WITH_C_API} == "ON" ]]; then - install_prefix="${PADDLE_ROOT}/build/capi_output" - rm -rf $install_prefix - make DESTDIR="$install_prefix" install - cd $install_prefix/usr/local + 
capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
+        rm -rf $capi_install_prefix
+        make DESTDIR="$capi_install_prefix" install
+        cd $capi_install_prefix/
         ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
     fi
 }
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index ece4046f5b7a7eff5be724d6f890665be7f3344e..58a4c66c206c3f783437126c855c2890644f1bc0 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
     retry_limit = 3
     while not (os.path.exists(filename) and md5file(filename) == md5sum):
         if os.path.exists(filename):
-            print("file md5", md5file(filename), md5sum)
+            sys.stderr.write("file md5: %s, expected: %s\n" %
+                             (md5file(filename), md5sum))
         if retry < retry_limit:
             retry += 1
         else:
             raise RuntimeError("Cannot download {0} within retry limit {1}".
                                format(url, retry_limit))
-        print("Cache file %s not found, downloading %s" % (filename, url))
+        sys.stderr.write("Cache file %s not found, downloading %s" %
+                         (filename, url))
         r = requests.get(url, stream=True)
         total_length = r.headers.get('content-length')
@@ -100,10 +101,11 @@
                     dl += len(data)
                     f.write(data)
                     done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                    sys.stderr.write("\r[%s%s]" % ('=' * done,
                                                    ' ' * (50 - done)))
                     sys.stdout.flush()
-
+            sys.stderr.write("\n")
+            sys.stderr.flush()
     return filename
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 5607f11932bbe6aff548be316dc39b4636e079f4..c82bc0b940a32bf584e87646442c2507864c2285 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -18,5 +18,10 @@ from . import decoder
 from .decoder import *
 from . import memory_usage_calc
 from .memory_usage_calc import *
+from . import op_frequence
+from .op_frequence import *
-__all__ = decoder.__all__ + memory_usage_calc.__all__
+__all__ = []
+__all__ += decoder.__all__
+__all__ += memory_usage_calc.__all__
+__all__ += op_frequence.__all__
diff --git a/python/paddle/fluid/contrib/op_frequence.py b/python/paddle/fluid/contrib/op_frequence.py
new file mode 100644
index 0000000000000000000000000000000000000000..68dd0a946b4b69d47d51dce3de25ce147198f09a
--- /dev/null
+++ b/python/paddle/fluid/contrib/op_frequence.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from collections import OrderedDict
+
+from ..framework import Program
+
+__all__ = ['op_freq_statistic']
+
+
+def op_freq_statistic(program):
+    """
+    Statistics of Op frequency.
+
+    Args:
+        program(Program): The current Program.
+
+    Returns:
+        uni_op_freq(list): (op_type, frequency) pairs for single ops, sorted by frequency in descending order.
+        adj_2_op_freq(list): (op_pair, frequency) pairs for two adjacent ops, sorted by frequency in descending order.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic(
+        >>>         fluid.default_main_program())
+        >>> for op_type, op_num in uni_op_freq:
+        >>>     print("%s \t %d" % (op_type, op_num))
+        >>> for op_type, op_num in adj_2_op_freq:
+        >>>     print("%s \t %d" % (op_type, op_num))
+
+    """
+
+    if not isinstance(program, Program):
+        raise TypeError("The input type should be Program. "
+                        "But you passed in %s" % (type(program)))
+
+    uni_op_freq = OrderedDict()
+    adj_2_op_freq = OrderedDict()
+    op_in_ops = OrderedDict()
+
+    parameters = [p.name for p in program.blocks[0].all_parameters()]
+
+    # get uni_op_freq
+    for op in program.global_block().ops:
+        had_recorded = False
+        for var_name in op.output_arg_names:
+            if var_name in parameters:
+                continue
+            if not had_recorded and op.type in uni_op_freq:
+                uni_op_freq[op.type] += 1
+                had_recorded = True
+            elif not had_recorded:
+                uni_op_freq[op.type] = 1
+                had_recorded = True
+
+    # get adj_2_op_freq
+    var_gen_op = {}
+    for op in program.global_block().ops:
+        for var_name in op.input_arg_names:
+            if var_name in parameters:
+                continue
+            if var_name in var_gen_op:
+                assert len(var_gen_op[var_name]) > 0
+                if op.type in op_in_ops:
+                    op_in_ops[op.type].append(var_gen_op[var_name][-1])
+                else:
+                    op_in_ops[op.type] = [var_gen_op[var_name][-1]]
+            else:
+                print("Var's generating op is not found: %s, %s" %
+                      (var_name, op.type))
+
+        for var_name in op.output_arg_names:
+            if var_name in var_gen_op:
+                var_gen_op[var_name].append(op.type)
+            else:
+                var_gen_op[var_name] = [op.type]
+
+    for op, in_ops in op_in_ops.items():
+        for in_op in in_ops:
+            op_op = in_op + "->" + op
+            if op_op in adj_2_op_freq:
+                adj_2_op_freq[op_op] += 1
+            else:
+                adj_2_op_freq[op_op] = 1
+
+    uni_op_freq = sorted(
+        uni_op_freq.items(), key=lambda item: item[1], reverse=True)
+    adj_2_op_freq = sorted(
+        adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
+
+    return uni_op_freq, adj_2_op_freq
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 574d0d727cba9fa9de0cffbe116f71b9e65a7092..9772c65738a2c5373f657164e3bc379404ba642e 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -284,7 +284,7 @@ def detection_output(loc,
         target_box=loc,
         code_type='decode_center_size')
     compile_shape = scores.shape
-    run_shape = ops.shape(scores)
+    run_shape = nn.shape(scores)
     scores = nn.flatten(x=scores, axis=2)
     scores = nn.softmax(input=scores)
     scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
@@ -697,7 +697,7 @@ def ssd_loss(location,
         raise ValueError("Only support mining_type == max_negative now.")

     num, num_prior, num_class = confidence.shape
-    conf_shape = ops.shape(confidence)
+    conf_shape = nn.shape(confidence)

     def __reshape_to_2d(var):
         return nn.flatten(x=var, axis=2)
@@ -724,7 +724,7 @@ def ssd_loss(location,
     target_label.stop_gradient = True
     conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
     # 3. Mining hard examples
-    actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2])
+    actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
     actual_shape.stop_gradient = True
     conf_loss = nn.reshape(
         x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 75c29b12724d53783b9748d6df066c52bd232482..d56fa76300e7054ef71a7729483a579fa35f1dac 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -311,7 +311,6 @@ def _copy_reader_var_(block, var):
     new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
     new_var.desc.set_shapes(var.desc.shapes())
     new_var.desc.set_dtypes(var.desc.dtypes())
-    new_var.desc.set_lod_levels(var.desc.lod_levels())
     new_var.persistable = True
     return new_var
@@ -633,7 +632,6 @@ def py_reader(capacity,
         })

     startup_var.desc.set_dtypes(dtypes)
-    startup_var.desc.set_lod_levels(lod_levels)
     startup_var.persistable = True

     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index b1598bfec210474ae1e17f9f88e8b57aa80b8452..a3064b565d096f7feda18379c66ffc8bf2f4a55c 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
     return acc_out

-def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
+def auc(input,
+        label,
+        curve='ROC',
+        num_thresholds=2**12 - 1,
+        topk=1,
+        slide_steps=1):
     """
     **Area Under the Curve (AUC) Layer**
@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
         num_thresholds(int): The number of thresholds to use when discretizing
                              the roc curve. Default: 2**12 - 1 (4095).
         topk(int): only topk number of prediction output will be used for auc.
+        slide_steps(int): When computing the batch AUC, statistics from previous
+                          steps can be used together with the current step.
+                          slide_steps=1 uses only the current step, slide_steps=3
+                          uses the current step and the previous two steps, and
+                          slide_steps=0 uses all steps accumulated so far.
+
     Returns:
         Variable: A scalar representing the current AUC.
@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
     auc_out = helper.create_tmp_variable(dtype="float64")
     batch_auc_out = helper.create_tmp_variable(dtype="float64")
     # make tp, tn, fp, fn persistable, so that they can accumulate across all batches.
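To make the new slide_steps bookkeeping concrete: the batch statistics are now allocated as [slide_steps, num_thresholds + 1] tensors, one histogram row per step in the window. The following is a minimal NumPy sketch of that idea, illustrative only and not part of this patch; the auc op's actual C++ bookkeeping may differ in detail:

    import numpy as np

    num_thresholds = 7   # tiny value for illustration; the layer defaults to 2**12 - 1
    slide_steps = 3      # the current step plus the previous two

    # one histogram row per step in the window, mirroring batch_stat_pos/batch_stat_neg
    win_pos = np.zeros((slide_steps, num_thresholds + 1), dtype=np.int64)
    win_neg = np.zeros_like(win_pos)

    def update(step, preds, labels):
        """Overwrite this step's ring-buffer slot, then return the windowed stats."""
        row = step % slide_steps
        win_pos[row] = 0
        win_neg[row] = 0
        buckets = (preds * num_thresholds).astype(np.int64)
        for b, y in zip(buckets, labels):
            (win_pos if y else win_neg)[row, b] += 1
        # an AUC over the last `slide_steps` batches would use these sums
        return win_pos.sum(axis=0), win_neg.sum(axis=0)

    pos, neg = update(0, np.array([0.9, 0.2, 0.7]), np.array([1, 0, 1]))

The global statistics keep a single accumulating row, which is why the layer below appends the auc op twice: once with the configured slide_steps for the batch AUC, and once with slide_steps=0 for the global AUC.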
+ + # for batch auc + batch_stat_pos = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + batch_stat_neg = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + + # for global auc stat_pos = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) stat_neg = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) - for var in [stat_pos, stat_neg]: + for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: helper.set_variable_initializer( var, Constant( value=0.0, force_cpu=True)) + # Batch AUC + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [batch_stat_pos], + "StatNeg": [batch_stat_neg] + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": slide_steps + }, + outputs={ + "AUC": [batch_auc_out], + "StatPosOut": [batch_stat_pos], + "StatNegOut": [batch_stat_neg] + }) + # Global AUC helper.append_op( type="auc", inputs={ @@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): "StatPos": [stat_pos], "StatNeg": [stat_neg] }, - attrs={"curve": curve, - "num_thresholds": num_thresholds}, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": 0 + }, outputs={ "AUC": [auc_out], - "BatchAUC": [batch_auc_out], "StatPosOut": [stat_pos], "StatNegOut": [stat_neg] }) - return auc_out, batch_auc_out, [stat_pos, stat_neg] + return auc_out, batch_auc_out, [ + batch_stat_pos, batch_stat_neg, stat_pos, stat_neg + ] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6e0f3de4141aff3efb98b6d1162823a13f83a7bb..2fbf44977c60be799419b5c329939130f605e221 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,110 +29,29 @@ from .. 
import unique_name from functools import reduce __all__ = [ - 'fc', - 'embedding', - 'dynamic_lstm', - 'dynamic_lstmp', - 'dynamic_gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'cos_sim', - 'cross_entropy', - 'square_error_cost', - 'chunk_eval', - 'sequence_conv', - 'conv2d', - 'conv3d', - 'sequence_pool', - 'sequence_softmax', - 'softmax', - 'pool2d', - 'pool3d', - 'batch_norm', - 'beam_search_decode', - 'conv2d_transpose', - 'conv3d_transpose', - 'sequence_expand', - 'sequence_expand_as', - 'sequence_pad', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'sequence_first_step', - 'sequence_last_step', - 'dropout', - 'split', - 'ctc_greedy_decoder', - 'edit_distance', - 'l2_normalize', - 'matmul', - 'topk', - 'warpctc', - 'sequence_reshape', - 'transpose', - 'im2sequence', - 'nce', - 'hsigmoid', - 'beam_search', - 'row_conv', - 'multiplex', - 'layer_norm', - 'softmax_with_cross_entropy', - 'smooth_l1', - 'one_hot', - 'autoincreased_step_counter', - 'reshape', - 'squeeze', - 'unsqueeze', - 'lod_reset', - 'lrn', - 'pad', - 'pad_constant_like', - 'label_smooth', - 'roi_pool', - 'dice_loss', - 'image_resize', - 'image_resize_short', - 'resize_bilinear', - 'gather', - 'scatter', - 'sequence_scatter', - 'random_crop', - 'mean_iou', - 'relu', - 'log', - 'crop', - 'rank_loss', - 'elu', - 'relu6', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', - 'prelu', - 'brelu', - 'leaky_relu', - 'soft_relu', - 'flatten', - 'sequence_mask', - 'stack', - 'pad2d', - 'unstack', - 'sequence_enumerate', - 'expand', - 'sequence_concat', - 'scale', - 'elementwise_add', - 'elementwise_div', - 'elementwise_sub', - 'elementwise_mul', - 'elementwise_max', - 'elementwise_min', - 'elementwise_pow', + 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', + 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', + 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d', + 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d', + 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', + 'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit', + 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod', + 'sequence_first_step', 'sequence_last_step', 'dropout', 'split', + 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk', + 'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce', + 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm', + 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', + 'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze', + 'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool', + 'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear', + 'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu', + 'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid', + 'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten', + 'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate', + 'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div', + 'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min', + 'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random', + 'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape' ] @@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None): return out +from paddle.fluid.framework import 
convert_np_dtype_to_dtype_
+
+
+@templatedoc()
+def uniform_random_batch_size_like(input,
+                                   shape,
+                                   dtype='float32',
+                                   input_dim_idx=0,
+                                   output_dim_idx=0,
+                                   min=-1.0,
+                                   max=1.0,
+                                   seed=0):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+        shape (tuple|list): ${shape_comment}
+        input_dim_idx (Int): ${input_dim_idx_comment}
+        output_dim_idx (Int): ${output_dim_idx_comment}
+        min (Float): ${min_comment}
+        max (Float): ${max_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The output data type: float32, float16, int, etc.
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('uniform_random_batch_size_like', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='uniform_random_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx,
+            'min': min,
+            'max': max,
+            'seed': seed,
+            'dtype': c_dtype
+        })
+
+    return out
+
+
+@templatedoc()
+def gaussian_random(shape,
+                    mean=0.0,
+                    std=1.0,
+                    seed=0,
+                    dtype='float32',
+                    use_mkldnn=False):
+    """
+    ${comment}
+
+    Args:
+        shape (tuple|list): ${shape_comment}
+        mean (Float): ${mean_comment}
+        std (Float): ${std_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
+        use_mkldnn (Bool): Only used in mkldnn kernel.
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('gaussian_random', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='gaussian_random',
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'mean': mean,
+            'std': std,
+            'seed': seed,
+            'dtype': c_dtype,
+            'use_mkldnn': use_mkldnn
+        })
+
+    return out
+
+
+@templatedoc()
+def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        min (Float): ${min_comment}
+        max (Float): ${max_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The output data type: float32, float16, int, etc.
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('sampling_id', **locals())
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sampling_id',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'min': min,
+               'max': max,
+               'seed': seed})
+
+    return out
+
+
+@templatedoc()
+def gaussian_random_batch_size_like(input,
+                                    shape,
+                                    input_dim_idx=0,
+                                    output_dim_idx=0,
+                                    mean=0.0,
+                                    std=1.0,
+                                    seed=0,
+                                    dtype='float32'):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+        shape (tuple|list): ${shape_comment}
+        input_dim_idx (Int): ${input_dim_idx_comment}
+        output_dim_idx (Int): ${output_dim_idx_comment}
+        mean (Float): ${mean_comment}
+        std (Float): ${std_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The output data type: float32, float16, int, etc.
+
+    Returns:
+        out (Variable): ${out_comment}
+    """
+
+    helper = LayerHelper('gaussian_random_batch_size_like', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='gaussian_random_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx,
+            'mean': mean,
+            'std': std,
+            'seed': seed,
+            'dtype': c_dtype
+ }) + + return out + + +@templatedoc() +def sum(x, use_mkldnn=False): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + use_mkldnn (Bool): ${use_mkldnn_comment} + + Returns: + out (Variable): ${out_comment} + """ + + helper = LayerHelper('sum', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('x')) + helper.append_op( + type='sum', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'use_mkldnn': use_mkldnn}) + + return out + + +@templatedoc() +def slice(input, axes, starts, ends): + """ + ${comment} + + Args: + input (Variable): ${input_comment}. + axes (List): ${axes_comment} + starts (List): ${starts_comment} + ends (List): ${ends_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('slice', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='slice', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={'axes': axes, + 'starts': starts, + 'ends': ends}) + + return out + + +@templatedoc() +def shape(input): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('shape', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='shape', inputs={'Input': input}, outputs={'Out': out}) + + return out + + def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 363e9200bedfd9cc843513c2db16eadb5ffa33b4..220d065f8f1cc02508dea2679820e1f7f490866d 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -45,13 +45,6 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', - 'uniform_random_batch_size_like', - 'gaussian_random', - 'sampling_id', - 'gaussian_random_batch_size_like', - 'sum', - 'slice', - 'shape', 'maxout', ] diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..902dc6544ed6858c4cd8d64b14d6af2367059091 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -0,0 +1,109 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
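These seven ops (uniform_random_batch_size_like, gaussian_random, sampling_id, gaussian_random_batch_size_like, sum, slice, shape) now have real layer wrappers with documented signatures instead of the auto-generated stubs removed from ops.py above. A short usage sketch, illustrative only and based on the signatures recorded in API.spec:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3, 4], dtype='float32')  # runtime shape [N, 3, 4]
    dims = fluid.layers.shape(x)            # integer tensor holding x's runtime shape
    head = fluid.layers.slice(x, axes=[1], starts=[0], ends=[2])  # rows 0..1 of axis 1
    noise = fluid.layers.gaussian_random(shape=[8, 4], mean=0.0, std=1.0, seed=0)
    rand = fluid.layers.uniform_random_batch_size_like(
        input=x, shape=[1, 3, 4], min=-1.0, max=1.0)  # dim 0 is replaced by x's batch size
    total = fluid.layers.sum([x, rand])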
+ +from __future__ import print_function + +import paddle +import paddle.fluid as fluid + +import dist_ctr_reader +from test_dist_base import TestDistRunnerBase, runtime_main + +IS_SPARSE = True + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistCTR2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta() + """ network definition """ + dnn_data = fluid.layers.data( + name="dnn_data", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + lr_data = fluid.layers.data( + name="lr_data", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + label = fluid.layers.data( + name="click", + shape=[-1, 1], + dtype="int64", + lod_level=0, + append_batch_size=False) + + # build dnn model + dnn_layer_dims = [128, 64, 32, 1] + dnn_embedding = fluid.layers.embedding( + is_distributed=False, + input=dnn_data, + size=[dnn_input_dim, dnn_layer_dims[0]], + param_attr=fluid.ParamAttr( + name="deep_embedding", + initializer=fluid.initializer.Constant(value=0.01)), + is_sparse=IS_SPARSE) + dnn_pool = fluid.layers.sequence_pool( + input=dnn_embedding, pool_type="sum") + dnn_out = dnn_pool + for i, dim in enumerate(dnn_layer_dims[1:]): + fc = fluid.layers.fc( + input=dnn_out, + size=dim, + act="relu", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01)), + name='dnn-fc-%d' % i) + dnn_out = fc + + # build lr model + lr_embbding = fluid.layers.embedding( + is_distributed=False, + input=lr_data, + size=[lr_input_dim, 1], + param_attr=fluid.ParamAttr( + name="wide_embedding", + initializer=fluid.initializer.Constant(value=0.01)), + is_sparse=IS_SPARSE) + lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum") + + merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1) + + predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax') + acc = fluid.layers.accuracy(input=predict, label=label) + auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, + label=label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + sgd_optimizer.minimize(avg_cost) + + dataset = dist_ctr_reader.Dataset() + train_reader = paddle.batch(dataset.train(), batch_size=batch_size) + test_reader = paddle.batch(dataset.test(), batch_size=batch_size) + + return inference_program, avg_cost, train_reader, test_reader, None, predict + + +if __name__ == "__main__": + runtime_main(TestDistCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..95e39d891f7e6a3dcb57540bd96fe70027443cda --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import paddle +import tarfile + +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + +DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz" +DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e" +""" +avazu_ctr_data/train.txt +avazu_ctr_data/infer.txt +avazu_ctr_data/test.txt +avazu_ctr_data/data.meta.txt +""" + + +def read_data(file_name): + path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5) + tar = tarfile.open(path, "r:gz") + tar_info = None + for member in tar.getmembers(): + if member.name.endswith(file_name): + tar_info = member + f = tar.extractfile(tar_info) + ret_lines = [_.decode('utf-8') for _ in f.readlines()] + return ret_lines + + +class TaskMode: + TRAIN_MODE = 0 + TEST_MODE = 1 + INFER_MODE = 2 + + def __init__(self, mode): + self.mode = mode + + def is_train(self): + return self.mode == self.TRAIN_MODE + + def is_test(self): + return self.mode == self.TEST_MODE + + def is_infer(self): + return self.mode == self.INFER_MODE + + @staticmethod + def create_train(): + return TaskMode(TaskMode.TRAIN_MODE) + + @staticmethod + def create_test(): + return TaskMode(TaskMode.TEST_MODE) + + @staticmethod + def create_infer(): + return TaskMode(TaskMode.INFER_MODE) + + +class ModelType: + CLASSIFICATION = 0 + REGRESSION = 1 + + def __init__(self, mode): + self.mode = mode + + def is_classification(self): + return self.mode == self.CLASSIFICATION + + def is_regression(self): + return self.mode == self.REGRESSION + + @staticmethod + def create_classification(): + return ModelType(ModelType.CLASSIFICATION) + + @staticmethod + def create_regression(): + return ModelType(ModelType.REGRESSION) + + +def load_dnn_input_record(sent): + return list(map(int, sent.split())) + + +def load_lr_input_record(sent): + res = [] + for _ in [x.split(':') for x in sent.split()]: + res.append(int(_[0])) + return res + + +feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2} + + +class Dataset(object): + def train(self): + ''' + Load trainset. + ''' + file_name = "train.txt" + logger.info("load trainset from %s" % file_name) + mode = TaskMode.create_train() + return self._parse_creator(file_name, mode) + + def test(self): + ''' + Load testset. + ''' + file_name = "test.txt" + logger.info("load testset from %s" % file_name) + mode = TaskMode.create_test() + return self._parse_creator(file_name, mode) + + def infer(self): + ''' + Load infer set. + ''' + file_name = "infer.txt" + logger.info("load inferset from %s" % file_name) + mode = TaskMode.create_infer() + return self._parse_creator(file_name, mode) + + def _parse_creator(self, file_name, mode): + ''' + Parse dataset. 
+ ''' + + def _parse(): + data = read_data(file_name) + for line_id, line in enumerate(data): + fs = line.strip().split('\t') + dnn_input = load_dnn_input_record(fs[0]) + lr_input = load_lr_input_record(fs[1]) + if not mode.is_infer(): + click = int(fs[2]) + yield [dnn_input, lr_input, click] + else: + yield [dnn_input, lr_input] + + return _parse + + +def load_data_meta(): + ''' + load data meta info from path, return (dnn_input_dim, lr_input_dim) + ''' + lines = read_data('data.meta.txt') + err_info = "wrong meta format" + assert len(lines) == 2, err_info + assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[ + 1], err_info + res = map(int, [_.split(':')[1] for _ in lines]) + res = list(res) + logger.info('dnn input dim: %d' % res[0]) + logger.info('lr input dim: %d' % res[1]) + return res diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 85a96c0b53f6bc08687965048d6251265055a6fe..877d21ae882ab4efb49beb6a846ab71a22c2aab7 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -47,7 +47,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.3))) + value=0.01))) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -56,7 +56,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.2))) + value=0.01))) SIZE = 10 input_shape = conv_pool_2.shape @@ -68,7 +68,7 @@ def cnn_model(data): size=SIZE, act="softmax", param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) + initializer=fluid.initializer.Constant(value=0.01))) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6456d1b53a129db04ace7ff4413a3d76e922ccde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -0,0 +1,238 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
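For orientation, the reader module above plugs into dist_ctr.py roughly as follows; a stand-alone sketch, assuming the avazu archive downloads successfully:

    import paddle
    import dist_ctr_reader

    dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
    train_reader = paddle.batch(dist_ctr_reader.Dataset().train(), batch_size=2)
    for batch in train_reader():
        # each training sample is [dnn_input_ids, lr_input_ids, click]
        print(len(batch), batch[0])
        break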
+ +from __future__ import print_function + +import numpy as np +import argparse +import time +import math +import random + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "int64" +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' +DATA_MD5 = '24e49366eb0611c552667989de2f57d5' + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div( + cond_3, + fluid.layers.fill_constant( + shape=[1], value=batch_size * 1.0, dtype='float64'), + name="simnet_acc") + return acc + + +def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like( + input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like( + input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + +def get_optimizer(): + # SGD optimizer + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + return optimizer + + +def train_network(batch_size, is_distributed=False, is_sparse=False): + # query + q = fluid.layers.data( + name="query_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + q_emb = fluid.layers.embedding( + input=q, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + ## fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + pt_emb = fluid.layers.embedding( + input=pt, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + ## fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + nt_emb = fluid.layers.embedding( + input=nt, + is_distributed=is_distributed, + 
size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') + nt_ss = fluid.layers.softsign(nt_sum) + ## fc layer + nt_fc = fluid.layers.fc( + input=nt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) + cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) + # loss + avg_cost = get_loss(cos_q_pt, cos_q_nt) + # acc + acc = get_acc(cos_q_nt, cos_q_pt, batch_size) + return [avg_cost, acc, cos_q_pt] + + +def combination(x, y): + res = [[[xi, yi] for yi in y] for xi in x] + return res[0] + + +def get_one_data(file_list): + for file in file_list: + contents = [] + with open(file, "r") as fin: + for i in fin: + contents.append(i.strip()) + for index, q in enumerate(contents): + try: + one_data = [[int(j) for j in i.split(" ")] + for i in q.split(";")[:-1]] + if one_data[1][0] + one_data[1][1] != len(one_data) - 3: + q = fin.readline() + continue + tmp = combination(one_data[3:3 + one_data[1][0]], + one_data[3 + one_data[1][0]:]) + except Exception as e: + continue + + for each in tmp: + yield [one_data[2], 0, each[0], each[1]] + + +def get_batch_reader(file_list, batch_size): + def batch_reader(): + res = [] + for i in get_one_data(file_list): + if random.random() <= sample_rate: + res.append(i) + if len(res) >= batch_size: + yield res + res = [] + + return batch_reader + + +def get_train_reader(batch_size): + # The training data set. + train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet", + "train") + train_reader = get_batch_reader([train_file], batch_size) + train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"] + return train_reader, train_feed + + +class TestDistSimnetBow2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Train program + avg_cost, acc, predict = \ + train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = get_optimizer() + opt.minimize(avg_cost) + + # Reader + train_reader, _ = get_train_reader(batch_size) + return inference_program, avg_cost, train_reader, train_reader, acc, predict + + +if __name__ == "__main__": + paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") + runtime_main(TestDistSimnetBow2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..095a474fd3ac056c678f9051ed80ef363ae968c9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -0,0 +1,231 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import six +import tarfile +import string +import re +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab' +VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2' +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz' +DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a' + + +# Load dictionary. +def load_vocab(filename): + vocab = {} + if six.PY2: + with open(filename, 'r') as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + else: + with open(filename, 'r', encoding="utf-8") as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + return vocab + + +def get_worddict(dict_path): + word_dict = load_vocab(dict_path) + word_dict[""] = len(word_dict) + dict_dim = len(word_dict) + return word_dict, dict_dim + + +def conv_net(input, + dict_dim, + emb_dim=128, + window_size=3, + num_filters=128, + fc0_dim=96, + class_dim=2): + emb = fluid.layers.embedding( + input=input, + size=[dict_dim, emb_dim], + is_sparse=False, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=num_filters, + filter_size=window_size, + act="tanh", + pool_type="max", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + fc_0 = fluid.layers.fc( + input=[conv_3], + size=fc0_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + prediction = fluid.layers.fc( + input=[fc_0], + size=class_dim, + act="softmax", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + return prediction + + +def inference_network(dict_dim): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + out = conv_net(data, dict_dim) + return out + + +def get_reader(word_dict, batch_size): + # The training data set. + train_reader = paddle.batch(train(word_dict), batch_size=batch_size) + + # The testing data set. 
+ test_reader = paddle.batch(test(word_dict), batch_size=batch_size) + + return train_reader, test_reader + + +def get_optimizer(learning_rate): + optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) + return optimizer + + +class TestDistTextClassification2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + vocab = os.path.join(paddle.dataset.common.DATA_HOME, + "text_classification", "imdb.vocab") + word_dict, dict_dim = get_worddict(vocab) + + # Input data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = conv_net(data, dict_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=predict, label=label) + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = get_optimizer(learning_rate=0.001) + opt.minimize(avg_cost) + + # Reader + train_reader, test_reader = get_reader(word_dict, batch_size) + + return inference_program, avg_cost, train_reader, test_reader, acc, predict + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open( + paddle.dataset.common.download(DATA_URL, 'text_classification', + DATA_MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip(six.b( + "\n\r")).translate( + None, six.b(string.punctuation)).lower().split() + tf = tarf.next() + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx[''] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("train/pos/.*\.txt$"), + re.compile("train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. 
+ + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("test/pos/.*\.txt$"), + re.compile("test/neg/.*\.txt$"), word_idx) + + +if __name__ == "__main__": + paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5) + paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5) + runtime_main(TestDistTextClassification2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index f53f7f3b354e60619b17d601ff3f55d2b8b059db..a2cc57425841100a2b61279d1b447b88ed4b9a54 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - TrainTaskConfig.use_gpu = use_cuda - sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model( + def run_trainer(self, args): + TrainTaskConfig.use_gpu = args.use_cuda + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( args.is_dist, not args.sync_mode) if args.is_dist: @@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase): TrainTaskConfig.batch_size = 20 trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) TrainTaskConfig.local = not args.is_dist diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index f3e740fc7027a4a562b836c3113b87d55062c185..835306edd0f17490dd10110db40f42dce30b25bb 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): if __name__ == "__main__": + import os + os.environ['CPU_NUM'] = '1' + os.environ['USE_CUDA'] = "FALSE" runtime_main(TestDistWord2vec2x2) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b5549c507ed753f4504afd655be59b444164e6f3..e97643cddef22465436051a41ef4b825e9634d23 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + - str(actual_t)) + str(actual_t) + " in class " + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 1de4a9d016a177944253d12094722d3a05614be2..810e8a1a8547a92de923877695178e780981edeb 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -36,7 +36,11 @@ class TestAucOp(OpTest): "StatPos": stat_pos, "StatNeg": stat_neg } - self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} + self.attrs = { + 'curve': 'ROC', + 'num_thresholds': num_thresholds, + "slide_steps": 1 + } python_auc = metrics.Auc(name="auc", curve='ROC', @@ -45,7 +49,6 @@ 
class TestAucOp(OpTest): self.outputs = { 'AUC': np.array(python_auc.eval()), - 'BatchAUC': np.array(python_auc.eval()), 'StatPosOut': np.array(python_auc._stat_pos), 'StatNegOut': np.array(python_auc._stat_neg) } diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f6eb8f2c6d8b94f92e24ff789c91efb53a645a46..0c5343a97d5ef0f97fc6b144dfc82174eacb8573 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -20,6 +20,7 @@ import six import sys import collections import math +import paddle.fluid as fluid from op_test import OpTest @@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): self.detect = np.array(self.detect).astype('float32') self.mAP = np.array(self.mAP).astype('float32') - if (len(self.class_pos_count) > 0): + if len(self.class_pos_count) > 0: self.class_pos_count = np.array(self.class_pos_count).astype( 'int32') self.true_pos = np.array(self.true_pos).astype('float32') @@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() - self.class_pos_count = [0, 2, 1] + self.class_pos_count = [0, 2, 1, 0] self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.false_pos_lod = [[0, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 37cad73019c529f64868b6ad3c6e2fffe59cc0d8..856980e546eb55f4cd83f7f3862c714e0e996207 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -18,23 +18,27 @@ import time import unittest import os import sys -import six import signal import subprocess +import six import argparse +import paddle.fluid as fluid + +RUN_STEP = 10 + class TestDistRunnerBase(object): def get_model(self, batch_size=2): raise NotImplementedError( "get_model should be implemented by child classes.") - def get_transpiler(self, trainer_id, main_program, pserver_endpoints, - trainers, sync_mode): + @staticmethod + def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers, + sync_mode): # NOTE: import fluid until runtime, or else forking processes will cause error. 
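The refactor below standardizes how the tests build the transpiler: always through a DistributeTranspilerConfig, with fluid imported at module level. Used directly, the pattern looks like this (a sketch with placeholder endpoints, mirroring get_transpiler below):

    import paddle.fluid as fluid

    config = fluid.DistributeTranspilerConfig()
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id=0,
        program=fluid.default_main_program(),
        pservers="127.0.0.1:6174,127.0.0.1:6175",
        trainers=2,
        sync_mode=True)
    pserver_prog = t.get_pserver_program("127.0.0.1:6174")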
- import paddle - import paddle.fluid as fluid - t = fluid.DistributeTranspiler() + config = fluid.DistributeTranspilerConfig() + t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, program=main_program, @@ -44,9 +48,9 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): - import paddle - import paddle.fluid as fluid + self.get_model(batch_size=2) + if args.mem_opt: fluid.memory_optimize(fluid.default_main_program()) t = self.get_transpiler(args.trainer_id, @@ -61,12 +65,10 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - import paddle - import paddle.fluid as fluid - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + def run_trainer(self, args): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) + if args.mem_opt: fluid.memory_optimize(fluid.default_main_program()) if args.is_dist: @@ -74,16 +76,23 @@ class TestDistRunnerBase(object): fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) + trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) startup_exe.run(fluid.default_startup_program()) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False + build_stra = fluid.BuildStrategy() if args.use_reduce: @@ -92,7 +101,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor( - use_cuda, + args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) @@ -103,27 +112,26 @@ class TestDistRunnerBase(object): ] feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = test_reader() - - data = next(reader_generator) - first_loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(data)) - print(first_loss) + reader_generator = train_reader() - for i in six.moves.xrange(5): - data = next(reader_generator) - loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + def get_data(): + origin_batch = next(reader_generator) + if args.is_dist and args.use_reader_alloc: + new_batch = [] + for offset, item in enumerate(origin_batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return origin_batch - data = next(reader_generator) - last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - print(last_loss) + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + print(loss) def runtime_main(test_class): - import paddle - import paddle.fluid as fluid - import paddle.fluid.core as core - parser = argparse.ArgumentParser(description='Run dist test.') parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) @@ -135,7 +143,10 @@ def runtime_main(test_class): '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--mem_opt', action='store_true') + parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_reduce', action='store_true') + parser.add_argument( + '--use_reader_alloc', action='store_true', required=False, default=True) args = parser.parse_args() @@ -143,8 +154,7 @@ def runtime_main(test_class): if 
args.role == "pserver" and args.is_dist: model.run_pserver(args) else: - use_cuda = True if core.is_compiled_with_cuda() else False - model.run_trainer(use_cuda, args) + model.run_trainer(args) import paddle.compat as cpt @@ -163,8 +173,10 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = "python" self._sync_mode = True + self._use_cuda = True self._mem_opt = False self._use_reduce = False + self._use_reader_alloc = True self._setup_config() def _find_free_port(self): @@ -172,15 +184,15 @@ class TestDistBase(unittest.TestCase): s.bind(('', 0)) return s.getsockname()[1] - def start_pserver(self, model_file, check_error_log): + def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps0_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) ps1_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) if self._sync_mode: ps0_cmd += " --sync_mode" @@ -198,9 +210,15 @@ class TestDistBase(unittest.TestCase): ps1_pipe = open("/tmp/ps1_err.log", "wb") ps0_proc = subprocess.Popen( - ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) + ps0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps0_pipe, + env=required_envs) ps1_proc = subprocess.Popen( - ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + ps1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps1_pipe, + env=required_envs) if not check_error_log: return ps0_proc, ps1_proc, None, None @@ -222,59 +240,60 @@ class TestDistBase(unittest.TestCase): (e, retry_times)) retry_times -= 1 - def check_with_place(self, model_file, delta=1e-3, check_error_log=False): - # TODO(typhoonzero): should auto adapt GPU count on the machine. 
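The new --use_reader_alloc flag makes each trainer keep only its own share of every batch, via the get_data() helper above; a standalone model of that round-robin split (num_trainers is generalized here for illustration, while the tests hard-code two trainers):

    def shard_batch(batch, trainer_id, num_trainers=2):
        # Trainer i keeps the items whose offset is congruent to i, so
        # the trainers jointly cover the batch without overlap.
        return [item for offset, item in enumerate(batch)
                if offset % num_trainers == trainer_id]

    batch = list(range(8))
    assert shard_batch(batch, 0) == [0, 2, 4, 6]
    assert shard_batch(batch, 1) == [1, 3, 5, 7]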
- required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15", - "FLAGS_cudnn_deterministic": "1", - "CPU_NUM": "1" - } + def _run_local(self, model, envs, check_error_log): - if check_error_log: - required_envs["GLOG_v"] = "7" - required_envs["GLOG_logtostderr"] = "1" + cmd = "%s %s --role trainer" % (self._python_interp, model) + + if self._use_cuda: + cmd += " --use_cuda" + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + else: + env_local = {'CPU_NUM': '1'} + + envs.update(env_local) - # Run local to get a base line - env_local = {"CUDA_VISIBLE_DEVICES": "0"} - env_local.update(required_envs) - local_cmd = "%s %s --role trainer" % (self._python_interp, model_file) if not check_error_log: + err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( - local_cmd.split(" "), + cmd.split(" "), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) + stderr=err_log, + env=envs) else: - err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( - local_cmd.split(" "), + cmd.split(" "), stdout=subprocess.PIPE, - stderr=err_log, - env=env_local) + stderr=subprocess.PIPE, + env=envs) local_proc.wait() - out, err = local_proc.communicate() - local_ret = cpt.to_text(out) - sys.stderr.write('local_loss: %s\n' % local_ret) - sys.stderr.write('local_stderr: %s\n' % err) + local_out, local_err = local_proc.communicate() + local_ret = cpt.to_text(local_out) + + if check_error_log: + err_log.close() + + sys.stderr.write('local_stdout: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % local_err) + local_losses = local_ret.split("\n") + return local_losses + + def _run_cluster(self, model, envs, check_error_log): # Run dist train to compare with local results - ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, - check_error_log) + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, + check_error_log, envs) self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps1.pid) - ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr0_cmd = tr_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, - 0, ps0_ep, self._trainers) + (self._python_interp, model, self._ps_endpoints, + 0, ps0_ep, self._trainers) tr1_cmd = tr_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, - 1, ps1_ep, self._trainers) + (self._python_interp, model, self._ps_endpoints, + 1, ps1_ep, self._trainers) if self._sync_mode: tr0_cmd += " --sync_mode" @@ -285,18 +304,28 @@ class TestDistBase(unittest.TestCase): if self._use_reduce: tr0_cmd += " --use_reduce" tr1_cmd += " --use_reduce" + if self._use_reader_alloc: + tr0_cmd += " --use_reader_alloc" + tr1_cmd += " --use_reader_alloc" + if self._use_cuda: + tr0_cmd += " --use_cuda" + tr1_cmd += " --use_cuda" + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + else: + env0 = {'CPU_NUM': '1'} + env1 = {'CPU_NUM': '1'} + + env0.update(envs) + env1.update(envs) - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} - env0.update(required_envs) - env1.update(required_envs) FNULL = open(os.devnull, 'w') tr0_pipe = subprocess.PIPE tr1_pipe = subprocess.PIPE if check_error_log: - print("tr0_cmd:", tr0_cmd) - print("tr1_cmd:", tr1_cmd) + print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) + print("tr1_cmd:{}, env1: {}".format(tr1_cmd, 
env1)) tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") @@ -313,17 +342,11 @@ class TestDistBase(unittest.TestCase): tr0_proc.wait() tr1_proc.wait() - out, err = tr0_proc.communicate() - sys.stderr.write('dist_stderr: %s\n' % err) - loss_data0 = cpt.to_text(out) - sys.stderr.write('dist_loss: %s\n' % loss_data0) - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] + + tr0_out, tr0_err = tr0_proc.communicate() + tr0_loss_text = cpt.to_text(tr0_out) + tr1_out, tr1_err = tr1_proc.communicate() + tr1_loss_text = cpt.to_text(tr1_out) # close trainer file if check_error_log: @@ -341,5 +364,47 @@ class TestDistBase(unittest.TestCase): ps1.wait() FNULL.close() - self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) - self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + # print log + sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) + sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) + sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + + tr0_losses = tr0_loss_text.split("\n") + tr1_losses = tr1_loss_text.split("\n") + + return tr0_losses, tr1_losses + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + # TODO(typhoonzero): should auto adapt GPU count on the machine. + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_cudnn_deterministic": "1", + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + + local_losses\ + = self._run_local(model_file, required_envs, + check_error_log) + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, + check_error_log) + + for step_id in range(RUN_STEP): + local_loss = eval(local_losses[step_id])[0] + tr0_loss = eval(tr0_losses[step_id])[0] + tr1_loss = eval(tr1_losses[step_id])[0] + dist_loss = (tr0_loss + tr1_loss) / 2 + print(str(local_loss) + ":" + str(dist_loss)) + self.assertAlmostEqual(local_loss, dist_loss, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..081d6e9273ebaf7af643b8481399d11d1ab60e00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
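The RUN_STEP comparison loop above replaces the old first/last-loss check: the two trainers' per-step losses are averaged and checked against the single-process baseline at every step. Schematically, with losses already parsed to floats (the harness itself eval()s each printed line first):

    RUN_STEP = 10

    def compare_losses(local_losses, tr0_losses, tr1_losses, delta):
        # Average the two trainers' losses, then compare step by step.
        for step_id in range(RUN_STEP):
            dist_loss = (tr0_losses[step_id] + tr1_losses[step_id]) / 2.0
            assert abs(local_losses[step_id] - dist_loss) <= delta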
+from __future__ import print_function + +import os +import unittest +from test_dist_base import TestDistBase + + +class TestDistCTR2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_dist_ctr(self): + self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 09b1c546e49bd02bf336f31885bf4c7339cc5a2c..f65dd7e2a28c4ace3988c0cc1267ebe981fbd9cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase): self._use_reduce = False def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2WithMemopt(TestDistBase): @@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): self._mem_opt = True def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnistAsync(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2b089694ea2f329e67ad6c50def26caa454720e..d2d927aca8428acd88a6a73c05d70e93439f861c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -20,9 +20,10 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): def _setup_config(self): self._sync_mode = True + self._use_reader_alloc = False def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=1e-7) + self.check_with_place("dist_se_resnext.py", delta=100) # TODO(typhoonzero): fix this test @@ -38,6 +39,7 @@ class TestDistSeResneXt2x2(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False + self._use_reader_alloc = False def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc707c245ab13dd2dbe50b953ef5308aba05b78 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -0,0 +1,79 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
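Each of the new CPU-only distributed tests in this patch follows the same pattern: subclass TestDistBase, choose sync/async and the device in _setup_config, then point check_with_place at a model script. A template with hypothetical class and script names:

    from test_dist_base import TestDistBase

    class TestDistMyModel2x2(TestDistBase):
        def _setup_config(self):
            self._sync_mode = True
            self._use_cuda = False

        def test_my_model(self):
            # Async variants use a loose delta (e.g. 100), which in
            # effect only asserts that training runs to completion.
            self.check_with_place("dist_my_model.py", delta=1e-5)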
+from __future__ import print_function + +import os +import unittest + +from test_dist_base import TestDistBase + + +class TestDistSimnetBowDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2DenseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBowSparse2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2SparseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..b830c965caf2e47c5cc648bc98960459fa6b30ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
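The need_envs dictionaries above reach the model script through the subprocess environment that check_with_place assembles; the script is then expected to branch on them roughly as below (read_flag is a hypothetical helper, and the '0'/'1' string convention is inferred from the values passed here):

    import os

    def read_flag(name, default='0'):
        # IS_DISTRIBUTED and IS_SPARSE arrive as the strings '0' or '1'.
        return os.getenv(name, default) == '1'

    os.environ['IS_SPARSE'] = '1'
    assert read_flag('IS_SPARSE') is True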
+
+from __future__ import print_function
+import os
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistTextClassification2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_cuda = False
+
+    def test_text_classification(self):
+        self.check_with_place("dist_text_classification.py", delta=1e-6)
+
+
+class TestDistTextClassification2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._use_cuda = False
+
+    def test_text_classification(self):
+        self.check_with_place("dist_text_classification.py", delta=100)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index ecde407e6d85ea1bfc0181b4b60e095ea496fb1a..54a1c68a37f6929890aab697b48d621e6effb7d8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -264,6 +264,25 @@
         ])
 
 
+class TestDecayedAdagrad(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
+        opt.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer, _ = self.get_trainer()
+
+
 class TestLRDecayConditional(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
index 33b39b262b95b0013e3696c3f15a288a2e801ce1..b26cbdbea12962a3a41036c774de5dfb61999205 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase):
         self._sync_mode = False
 
     def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1)
+        self.check_with_place("dist_word2vec.py", delta=100)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index 86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5..9340d558577b4b3141df9317900ee33bbb683a0e 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest):
             'eta': self.eta
         }
 
-        print("lod = ", self.lod)
         self.outputs = {
             'RpnRois': (self.rpn_rois[0], [self.lod]),
             'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
@@ -295,7 +294,7 @@
         self.post_nms_topN = 5000  # train 6000, test 1000
         self.nms_thresh = 0.7
         self.min_size = 3.0
-        self.eta = 0.8
+        self.eta = 1.
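Fixing eta at 1.0 turns off adaptive NMS so the Python reference stays in lockstep with the op kernel. A minimal model of the decay rule, assuming (as in the op's adaptive mode) that the overlap threshold is scaled by eta while it remains above 0.5:

    def adaptive_thresholds(nms_thresh, eta, steps):
        # With eta == 1.0 the threshold never decays, so every NMS round
        # uses the same overlap threshold.
        out = []
        for _ in range(steps):
            out.append(nms_thresh)
            if nms_thresh > 0.5 and eta < 1.0:
                nms_thresh *= eta
        return out

    assert adaptive_thresholds(0.7, 1.0, 3) == [0.7, 0.7, 0.7]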
def init_test_input(self): batch_size = 1 diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f474cdae2054531d44724e0e3e0e58a35fb8ddcd..b8dc9e8ad7cd7cd100d5c3cb99319e6f5a37da91 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -541,7 +541,7 @@ class TestBook(unittest.TestCase): with program_guard(program): input = layers.data( name="input", shape=[3, 100, 100], dtype="float32") - out = layers.shape(input, name="shape") + out = layers.shape(input) self.assertIsNotNone(out) print(str(program)) @@ -758,6 +758,65 @@ class TestBook(unittest.TestCase): out = layers.expand(x, [1, 2]) print(str(program)) + def test_uniform_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.uniform_random_batch_size_like(input, [-1, 11]) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random(self): + program = Program() + with program_guard(program): + out = layers.gaussian_random(shape=[20, 30]) + self.assertIsNotNone(out) + print(str(program)) + + def test_sampling_id(self): + program = Program() + with program_guard(program): + x = layers.data( + name="X", + shape=[13, 11], + dtype='float32', + append_batch_size=False) + + out = layers.sampling_id(x) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.gaussian_random_batch_size_like( + input, shape=[-1, 11], mean=1.0, std=2.0) + self.assertIsNotNone(out) + print(str(program)) + + def test_sum(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.sum(input) + self.assertIsNotNone(out) + print(str(program)) + + def test_slice(self): + starts = [1, 0, 2] + ends = [3, 3, 4] + axes = [0, 1, 2] + + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 4, 5, 6], dtype='float32') + + out = layers.slice(input, axes=axes, starts=starts, ends=ends) + def test_softshrink(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..288c5f6a1f6b1760ca40c0c653e4c0726b799519 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -0,0 +1,121 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
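The new test_layers cases above pin down the Python signatures of the freshly exposed layers; condensed into one sketch (program and variable names are illustrative):

    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    prog = fluid.Program()
    with fluid.program_guard(prog):
        x = layers.data(name="x", shape=[3, 4, 5, 6], dtype='float32')
        # slice takes parallel per-axis starts/ends lists rather than
        # Python slice objects.
        sliced = layers.slice(x, axes=[0, 1, 2], starts=[1, 0, 2],
                              ends=[3, 3, 4])
        # shape no longer accepts a name argument.
        dims = layers.shape(x)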
+ +from __future__ import print_function + +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import unittest +import os +import sys +import math + + +def simple_fc_net(): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestPassBuilder(unittest.TestCase): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net() + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(loss) + + batch_size = 32 + image = np.random.normal(size=(batch_size, 784)).astype('float32') + label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=loss.name, + main_program=main, + build_strategy=build_strategy) + + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy) + + for i in range(5): + test_loss, = test_exe.run([loss.name], feed=feed_dict) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + + self.assertTrue( + np.allclose( + train_loss, test_loss, atol=1e-8), + "Train loss: " + str(train_loss) + "\n Test loss:" + + str(test_loss)) + + def test_parallel_testing_with_new_strategy(self): + build_strategy = fluid.BuildStrategy() + pass_builder = build_strategy._create_passes_from_strategy() + origin_len = len(pass_builder.all_passes()) + + viz_pass = pass_builder.append_pass("graph_viz_pass") + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + + pass_builder.insert_pass( + len(pass_builder.all_passes()), "graph_viz_pass") + self.assertEqual(origin_len + 2, len(pass_builder.all_passes())) + + pass_builder.remove_pass(len(pass_builder.all_passes()) - 1) + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass") + + self.check_network_convergence( + use_cuda=core.is_compiled_with_cuda(), + build_strategy=build_strategy) + try: + os.stat("/tmp/test_viz_pass") + except os.error: + self.assertFalse(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d9cc709f7428abcf415fe9a27b0d48a157ed0b41..ecdbe27f4d90268d755a712e25289cfaf4715f29 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -39,8 +39,8 @@ import six 
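The TestPassBuilder case above reduces to this manipulation of the pass list (the underscore-prefixed _create_passes_from_strategy is the API the test itself exercises, and the viz path matches the test's):

    import paddle.fluid as fluid

    build_strategy = fluid.BuildStrategy()
    pass_builder = build_strategy._create_passes_from_strategy()
    # Passes are appended by registered name and configured on the
    # returned pass object; all_passes() exposes the current pipeline.
    viz_pass = pass_builder.append_pass("graph_viz_pass")
    viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
    print(len(pass_builder.all_passes()))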
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ - default_startup_program, Block, \ - Parameter, grad_var_name + default_startup_program, Block, \ + Parameter, grad_var_name from .details import * from functools import reduce @@ -178,7 +178,7 @@ class DistributeTranspiler(object): pserver_program) elif role == "TRAINER": trainer_program = t.get_trainer_program() - + # for nccl2 mode config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" @@ -470,7 +470,10 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? + lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) + delete_ops(self.origin_program.global_block(), lr_ops) + self.origin_program.__str__() if wait_port: @@ -534,7 +537,7 @@ class DistributeTranspiler(object): }) for varname, splited_var in six.iteritems(self.param_var_mapping): - #add concat ops to merge splited parameters received from parameter servers. + # add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue # NOTE: if enable memory optimization, origin vars maybe removed. @@ -734,19 +737,14 @@ in a single call.") table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) optimize_blocks.append(table_opt_block) - prefetch_var_name_to_block_id = self._create_prefetch_block( + lookup_table_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) pserver_program._distributed_lookup_table = self.table_name - - # NOTE: if has_distributed_lookup_table is False, then prefetch_block will - # not be executed, so it's safe to use optimize_block to hold the place - if self.has_distributed_lookup_table: - assert len(prefetch_var_name_to_block_id) > 0 - else: - assert len(prefetch_var_name_to_block_id) == 0 + prefetch_var_name_to_block_id.extend( + lookup_table_var_name_to_block_id) attrs = { "optimize_blocks": optimize_blocks, @@ -755,11 +753,14 @@ in a single call.") "sync_mode": self.sync_mode, "grad_to_block_id": grad_to_block_id, } - if len(prefetch_var_name_to_block_id) > 0: - attrs['prefetch_var_name_to_block_id'] \ - = prefetch_var_name_to_block_id + + if self.has_distributed_lookup_table: attrs['checkpint_block_id'] = checkpoint_block_id + if len(prefetch_var_name_to_block_id) > 0: + attrs[ + 'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -1013,7 +1014,7 @@ to transpile() call.") for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program @@ -1320,7 +1321,7 @@ to transpile() call.") if len(splited) == 1: if self.sync_mode and add_trainer_suffix: new_var_name = "%s.trainer_%d" % \ - (orig_var.name, self.trainer_id) + (orig_var.name, self.trainer_id) program.global_block()._rename_var(varname, new_var_name) var_mapping[varname] = \ 
[program.global_block().var(new_var_name)] @@ -1343,10 +1344,10 @@ to transpile() call.") new_var_name = "" if self.sync_mode and add_trainer_suffix: new_var_name = "%s.block%d.trainer_%d" % \ - (varname, i, self.trainer_id) + (varname, i, self.trainer_id) else: new_var_name = "%s.block%d" % \ - (varname, i) + (varname, i) var = program.global_block().create_var( name=new_var_name, persistable=False, @@ -1430,6 +1431,9 @@ to transpile() call.") elif op_type == "rmsprop": if varkey in ["Moment", "MeanSquare"]: return param_shape + elif op_type == "decayed_adagrad": + if varkey == "Moment": + return param_shape elif op_type == "sgd": pass return orig_shape @@ -1484,9 +1488,8 @@ to transpile() call.") vars2merge = [] for i in range(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ - (merged_var_name, i) + (merged_var_name, i) vars2merge.append(pserver_block.vars[per_trainer_name]) - optimize_block.append_op( type="sum", inputs={"X": vars2merge}, @@ -1645,7 +1648,7 @@ to transpile() call.") # one op's output is another op's input, we say # the two operator is connected. if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ - set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): + set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): return True return False @@ -1662,7 +1665,7 @@ to transpile() call.") def _is_optimizer_op(self, op): if "Param" in op.input_names and \ - "LearningRate" in op.input_names: + "LearningRate" in op.input_names: return True return False @@ -1737,7 +1740,7 @@ to transpile() call.") # NOTE: we need to skip all optimize ops, since it is connected # with forward/backward ops and lr ops, we only need the lr ops. if op1 != op2 and self._is_op_connected(op1, op2) and \ - not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): + not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): ufind.union(op1, op2) # find all ops which is related with lr var for op1 in block.ops:
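The learning-rate-op grouping above hinges on a simple dataflow predicate: two ops are connected when either one's outputs intersect the other's inputs. In isolation, over plain name sets:

    def is_connected(op1_ins, op1_outs, op2_ins, op2_outs):
        # Mirrors _is_op_connected on the ops' argument-name sets.
        return bool(set(op1_outs) & set(op2_ins)) or \
            bool(set(op1_ins) & set(op2_outs))

    assert is_connected({'x'}, {'y'}, {'y'}, {'z'})       # chained ops
    assert not is_connected({'a'}, {'b'}, {'c'}, {'d'})   # unrelated ops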