diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index b810dc941d27fdb5004812ab58e105502e83280f..00650946ff2e658cfad0e63a8f1e008902a2d36e 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -1,6 +1,6 @@
 # PaddlePaddle Releasing Process
 
-PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as it's version number semantics.
+PaddlePaddle manages its branches using Trunk Based Development, and [Semantic Versioning](http://semver.org/) as its version number semantics.
 
 Each time we release a new PaddlePaddle version, we should follow the below steps:
 
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index abd5459f6d47da6d1341284916b419325dc5977c..a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -28,10 +28,20 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
+if(WITH_GPU)
+  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
+          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
+endif()
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
 
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+if(WITH_GPU)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
+else()
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+endif()
+
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
 
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index d9fcd92427ef38b131b4ce782c0ada37765682db..e98f1ab148db083ac63a1afd43e334fbfae62539 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -32,6 +32,10 @@ struct ComputationOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
+  const Scope *GetScope() const { return scope_; }
+
+  const platform::Place &GetPlace() const { return place_; }
+
  protected:
   void RunImpl() override;
 
diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..71db8d952f4c205b875ad254dc19c0c1f74e61b3
--- /dev/null
+++ b/paddle/fluid/framework/details/reference_count_op_handle.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+using ReferenceCountMap = std::unordered_map<std::string, int>;
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<int>>;
+using DeviceReferenceCountMap =
+    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
+using AtomicDeviceReferenceCountMap =
+    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
+using DeviceGarbageCollectorMap =
+    std::unordered_map<int,
+                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;
+
+class ReferenceCountOpHandle : public OpHandleBase {
+ public:
+  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
+                         const platform::CUDAPlace &place,
+                         const std::vector<std::string> &var_names,
+                         GarbageCollector<Tensor> *gc,
+                         AtomicReferenceCountMap *ref_cnts)
+      : OpHandleBase(node),
+        scope_(scope),
+        var_names_(var_names),
+        gc_(gc),
+        ref_cnts_(ref_cnts) {
+    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    if (IsStreamGarabageCollector()) {
+      PADDLE_ENFORCE(cudaSetDevice(place.device));
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+    }
+  }
+
+  ~ReferenceCountOpHandle() {
+    if (IsStreamGarabageCollector()) {
+      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventDestroy(event_));
+    }
+  }
+
+  std::string Name() const override { return "reference_count"; }
+
+ protected:
+  void RunImpl() override {
+    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    std::vector<LoDTensor *> tensors;
+    for (auto &name : var_names_) {
+      auto it = ref_cnts_->find(name);
+      if (it == ref_cnts_->end()) continue;
+
+      auto *var = exec_scope->FindVar(name);
+      if (var == nullptr || !var->IsType<LoDTensor>()) continue;
+
+      if (it->second.fetch_sub(1) <= 1) {
+        tensors.emplace_back(var->GetMutable<LoDTensor>());
+      }
+    }
+
+    if (!tensors.empty()) {
+      ClearTensors(tensors);
+    }
+  }
+
+ private:
+  void ClearTensors(const std::vector<LoDTensor *> &tensors) {
+    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
+    if (gc != nullptr) {
+      auto compute_stream = dev_ctx_->stream();
+      auto callback_stream = gc->stream();
+      auto callback_func = [=]() {
+        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
+        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      };
+      gc_->Add(tensors, callback_func);
+    } else {
+      gc_->Add(tensors);
+    }
+  }
+
+  bool IsStreamGarabageCollector() const {
+    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
+  }
+
+  const Scope *scope_;
+  platform::CUDADeviceContext *dev_ctx_;
+  std::vector<std::string> var_names_;
+  GarbageCollector<Tensor> *gc_;       // not own
+  AtomicReferenceCountMap *ref_cnts_;  // not own
+  cudaEvent_t event_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..344754d5a1e119c04cae08ad50126924b5824315
--- /dev/null
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/reference_count_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
+  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
+  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);
+
+  // It is not easy to find the right reference counts of variables in graph
+  // Step 1: Find all variables in computation ops
+  // Step 2: Find all variables in non-computation ops which refer to variables
+  // in computation ops
+  std::unordered_set<std::string> names;
+  auto get_ref_cnts_from_compute_op = [&](
+      const std::unique_ptr<OpHandleBase> &op,
+      const std::vector<VarHandleBase *> &vars) {
+    std::vector<std::string> var_names_in_op;
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    if (compute_op == nullptr ||
+        !platform::is_gpu_place(compute_op->GetPlace()))
+      return var_names_in_op;
+    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
+    for (VarHandleBase *var_handle_base : vars) {
+      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
+      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
+
+      if (!platform::is_gpu_place(var_handle->place_) ||
+          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
+        continue;
+
+      VarDesc *var_desc = var_handle->Node()->Var();
+      auto var_name = var_handle->Node()->Name();
+
+      // This is weird, but there really are some variables without var_desc
+      // in computation_op
+      if (var_desc == nullptr) {
+        if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
+          continue;
+      } else {
+        if (var_desc->Persistable() ||
+            var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR)
+          continue;
+      }
+
+      // compute op only runs in one device
+      if (ref_cnts[place.device]->count(var_name))
+        ++(*ref_cnts[place.device])[var_name];
+      else
+        (*ref_cnts[place.device])[var_name] = 1;
+
+      names.insert(var_name);
+      var_names_in_op.push_back(var_name);
+    }
+    return var_names_in_op;
+  };
+
+  auto update_ref_cnts_from_non_compute_op = [&](
+      const std::unique_ptr<OpHandleBase> &op,
+      const std::vector<VarHandleBase *> &vars) {
+    if (dynamic_cast<ComputationOpHandle *>(op.get()) != nullptr) return;
+    for (VarHandleBase *var_handle_base : vars) {
+      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
+      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
+
+      auto var_name = var_handle->Node()->Name();
+      auto var_place = var_handle->place_;
+      if (!platform::is_gpu_place(var_place)) continue;
+      auto place = boost::get<platform::CUDAPlace>(var_place);
+      if (names.count(var_name) == 0) continue;
+      if (ref_cnts.count(place.device) &&
+          ref_cnts[place.device]->count(var_name)) {
+        ++(*ref_cnts[place.device])[var_name];
+      }
+    }
+  };
+
+  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
+      compute_ref_cnt_map;
+  auto &all_ops = graph->Get<GraphOps>(kGraphOps);
+  for (auto &op : all_ops) {
+    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
+    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
+    if (in_var_names.empty() && out_var_names.empty()) continue;
+    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
+                        out_var_names.end());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
+    ir::Node *ref_cnt_node =
+        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
+    auto *ref_cnt_handle = new ReferenceCountOpHandle(
+        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
+        gcs[place.device].get(), cur_ref_cnts[place.device].get());
+    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+    compute_op->AddOutput(dep_var);
+    ref_cnt_handle->AddInput(dep_var);
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
+  }
+
+  for (auto &op : all_ops) {
+    update_ref_cnts_from_non_compute_op(op, op->Inputs());
+    update_ref_cnts_from_non_compute_op(op, op->Outputs());
+  }
+
+  std::vector<std::unique_ptr<OpHandleBase>> new_all_ops;
+  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
+  for (auto &op : all_ops) {
+    new_all_ops.emplace_back(std::move(op));
+    auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
+    if (it != compute_ref_cnt_map.end()) {
+      new_all_ops.emplace_back(it->second);
+    }
+  }
+
+  all_ops.swap(new_all_ops);
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(reference_count_pass,
+              paddle::framework::details::ReferenceCountPass)
+    .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
+    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
+    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..7081280b0600b9c1985987d02d679c298ad4b8bd
--- /dev/null
+++ b/paddle/fluid/framework/details/reference_count_pass.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/reference_count_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kGlobalReferenceCount[] = "reference_count";
+constexpr char kCurReferenceCount[] = "current_reference_count";
+constexpr char kGarbageCollector[] = "garbage_collector";
+
+class ReferenceCountPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 5bd974d6b789a2f085c0a69de5e133187342f587..e5b1eaa7318aecde1dbf89de8fe242a3008db97c 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -18,6 +18,9 @@
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/reference_count_op_handle.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -65,12 +68,28 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
 
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
+
+#ifdef PADDLE_WITH_CUDA
+  const std::string gc_name = "garbage_collector";
+  DeviceGarbageCollectorMap *gc =
+      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
+                           : nullptr;
+#endif
+
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
     drop_scope_counter_ = 0;
     // Wait All computational streams
     for (auto p : places_) {
       platform::DeviceContextPool::Instance().Get(p)->Wait();
+#ifdef PADDLE_WITH_CUDA
+      if (gc != nullptr && platform::is_gpu_place(p)) {
+        auto gpu_place = boost::get<platform::CUDAPlace>(p);
+        auto &gc_at_place = gc->at(gpu_place.device);
+        gc_at_place->Wait();
+        gc_at_place->Reset();
+      }
+#endif
     }
     for (auto &scope : local_scopes_) {
       auto &local_scope =
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index dad170ed78c64202b5c812bd8682887fe3b736d6..8d8042a0563a21dad216ffd53a474322c378ace6 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -37,7 +37,11 @@ int kProgramId = -1;
 
 ExecutorPrepareContext::ExecutorPrepareContext(
     const framework::ProgramDesc& prog, size_t block_id)
-    : prog_(prog), block_id_(block_id) {}
+    : prog_(prog), block_id_(block_id) {
+  if (GetEagerDeletionThreshold() >= 0) {
+    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
+  }
+}
 
 ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
@@ -329,15 +333,81 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
   }
 
+  int64_t max_memory_size = GetEagerDeletionThreshold();
+
+  std::unique_ptr<GarbageCollector<Tensor>> gc;
+  if (max_memory_size >= 0) {
+    ctx->ResetReferenceCount();
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(place_)) {
+      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+          boost::get<platform::CUDAPlace>(place_), max_memory_size));
+    } else {
+#endif
+      gc.reset(new CPUGarbageCollector<Tensor>(
+          boost::get<platform::CPUPlace>(place_), max_memory_size));
+#ifdef PADDLE_WITH_CUDA
+    }
+#endif
+  }
+
   for (auto& op : ctx->ops_) {
     op->Run(*local_scope, place_);
 
+    if (gc != nullptr) {
+      std::vector<std::string> erase_vars;
+      for (auto& input : op->Inputs()) {
+        for (auto& input_name : input.second) {
+          auto it = ctx->cur_ref_cnts_.find(input_name);
+          if (it == ctx->cur_ref_cnts_.end()) continue;
+          if (it->second == 1) {  // should delete it
+            erase_vars.emplace_back(input_name);
+            ctx->cur_ref_cnts_.erase(input_name);
+          } else {
+            --(it->second);
+          }
+        }
+      }
+
+      for (auto& output : op->Outputs()) {
+        for (auto& output_name : output.second) {
+          auto it = ctx->cur_ref_cnts_.find(output_name);
+          if (it == ctx->cur_ref_cnts_.end()) continue;
+          if (it->second == 1) {
+            erase_vars.emplace_back(output_name);
+            ctx->cur_ref_cnts_.erase(output_name);
+          } else {
+            --(it->second);
+          }
+        }
+      }
+
+      if (!erase_vars.empty()) {
+        std::vector<framework::LoDTensor*> erase_tensors;
+        for (auto& name : erase_vars) {
+          auto* var = local_scope->FindVar(name);
+          if (var == nullptr) continue;
+          if (var->IsType<framework::LoDTensor>()) {
+            auto* tensor = var->GetMutable<framework::LoDTensor>();
+            erase_tensors.push_back(tensor);
+          }
+        }
+        if (!erase_tensors.empty()) gc->Add(erase_tensors);
+      }
+    }
+
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
     }
   }
-  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+
+  if (gc != nullptr) {
+    gc->Wait();
+  } else {
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  }
+
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
   } else {
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index f95808c199b9de693ec653c29374c9130be7fd59..f0cc1338a8af50030a70a9797cbcd1b0567272b5 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -27,13 +28,58 @@ namespace paddle {
 namespace framework {
 extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
 
+// Counts how often each non-persistable LOD_TENSOR variable of block
+// `block_id` appears among the inputs and outputs of that block's ops.
+// Persistable variables and variables of other types are excluded; a name
+// without a VarDesc in the block is not excluded, so it is counted too.
+// T is the counter type; it must support pre-increment starting from its
+// value-initialized state (e.g. int).
+template <typename T>
+std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
+    const ProgramDesc& prog, size_t block_id) {
+  auto& block = prog.Block(block_id);
+  std::unordered_set<std::string> ignored_vars;
+  std::unordered_map<std::string, T> ref_cnts;
+
+  // Collect every variable that must not be reference counted.
+  for (auto var_desc : block.AllVars()) {
+    auto type = var_desc->Proto()->type().type();
+    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
+      ignored_vars.insert(var_desc->Name());
+    }
+  }
+
+  // operator[] value-initializes a missing counter, so a plain increment
+  // handles both the first and any repeated occurrence of a name.
+  for (auto op_desc : block.AllOps()) {
+    for (auto& input : op_desc->Inputs()) {
+      for (auto& input_name : input.second) {
+        if (!ignored_vars.count(input_name)) ++ref_cnts[input_name];
+      }
+    }
+
+    for (auto& output : op_desc->Outputs()) {
+      // Bind by reference so that the output names are not copied.
+      for (auto& output_name : output.second) {
+        if (!ignored_vars.count(output_name)) ++ref_cnts[output_name];
+      }
+    }
+  }
+  return ref_cnts;
+}
+
 struct ExecutorPrepareContext {
   ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
   ~ExecutorPrepareContext();
 
+  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
+
   const framework::ProgramDesc& prog_;
   size_t block_id_;
   std::vector<std::unique_ptr<OperatorBase>> ops_;
+
+  std::unordered_map<std::string, int> ref_cnts_;
+  std::unordered_map<std::string, int> cur_ref_cnts_;
 };
 
 class Executor {
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
new file mode 100644
index 0000000000000000000000000000000000000000..b403252c972d26da6deeca54ce88a9547ffe7afa
--- /dev/null
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -0,0 +1,163 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>  // NOLINT
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+// T should have memory_size() and clear() method
+template <typename T>
+class GarbageCollector {
+ public:
+  GarbageCollector(const platform::Place &place, size_t max_memory_size)
+      : max_memory_size_(std::max(max_memory_size, static_cast<size_t>(1))) {
+    garbages_.reset(new std::deque<T *>());
+    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
+  }
+
+  virtual ~GarbageCollector() {}
+
+  void Reset() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    garbages_.reset(new std::deque<T *>());
+    cur_memory_size_ = 0;
+  }
+
+  template <typename Container>
+  void Add(const Container &objs) {
+    Add(objs, []() {});
+  }
+
+  template <typename Container, typename Callback>
+  void Add(const Container &objs, Callback &&callback) {
+    std::shared_ptr<std::deque<T *>> clear_deque;
+    {
+      std::lock_guard<std::mutex> guard(mutex_);
+      for (auto *obj : objs) {
+        garbages_->push_back(obj);
+        cur_memory_size_ += obj->memory_size();
+      }
+      if (cur_memory_size_ >= max_memory_size_) {
+        cur_memory_size_ = 0;
+        clear_deque = garbages_;
+        garbages_.reset(new std::deque<T *>());
+      }
+    }
+
+    if (clear_deque != nullptr) {
+      callback();
+      ClearCallback([=]() {
+        for (auto *obj : *clear_deque) obj->clear();
+      });
+    }
+  }
+
+  virtual void Wait() const {}
+
+ protected:
+  virtual void ClearCallback(const std::function<void()> &callback) = 0;
+
+  platform::DeviceContext *dev_ctx_;
+  std::shared_ptr<std::deque<T *>> garbages_;
+  mutable std::mutex mutex_;
+  const size_t max_memory_size_;
+  size_t cur_memory_size_ = 0;
+};
+
+template <typename T>
+class CPUGarbageCollector : public GarbageCollector<T> {
+ public:
+  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {}
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override {
+    callback();
+  }
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class DefaultStreamGarbageCollector : public GarbageCollector<T> {
+ public:
+  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {}
+
+  cudaStream_t stream() const {
+    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+        ->stream();
+  }
+
+  void Wait() const override {
+    this->dev_ctx_->Wait();
+    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+        ->WaitStreamCallback();
+  }
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override {
+    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+        ->AddStreamCallback(callback);
+  }
+};
+
+template <typename T>
+class StreamGarbageCollector : public GarbageCollector<T> {
+ public:
+  StreamGarbageCollector(const platform::CUDAPlace &place,
+                         size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {
+    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
+  }
+
+  ~StreamGarbageCollector() {
+    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
+    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+  }
+
+  void Wait() const override {
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+    std::lock_guard<std::mutex> guard(this->mutex_);
+    callback_manager_->Wait();
+  }
+
+  cudaStream_t stream() const { return stream_; }
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override {
+    std::lock_guard<std::mutex> guard(this->mutex_);
+    callback_manager_->AddCallback(callback);
+  }
+
+ private:
+  cudaStream_t stream_;
+  std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
+};
+#endif
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index ae8496204d4aeb88c04154d571325d440274e821..ab687e760a761d4e445726bd5149966adc2403d0 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -94,6 +94,14 @@ class Graph {
     };
   }
 
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
+                   attr_name);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = []() {};
+  }
+
   const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
 
   // Create a normal variable with non-null VarDesc.
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 5b8c75a93de2ddd8f7260d2191c22a5945b3d2d9..ae393d66a3b3ec0141667b44b5d9f3158e434e37 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -188,6 +188,30 @@ ParallelExecutor::ParallelExecutor(
       main_program, member_->places_, loss_var_name, params,
       member_->local_scopes_, member_->use_cuda_, build_strategy,
       member_->nccl_ctxs_.get());
+
+  auto max_memory_size = GetEagerDeletionThreshold();
+  if (max_memory_size >= 0) {
+    for (auto &place : member_->places_) {
+      if (!platform::is_gpu_place(place)) continue;
+      auto gpu_place = boost::get<platform::CUDAPlace>(place);
+      if (gcs_[gpu_place.device] == nullptr) {
+        ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap());
+        cur_ref_cnts_[gpu_place.device].reset(
+            new details::AtomicReferenceCountMap());
+        gcs_[gpu_place.device].reset(
+            new StreamGarbageCollector<Tensor>(gpu_place, max_memory_size));
+      }
+    }
+    if (!gcs_.empty()) {
+      auto pass = ir::PassRegistry::Instance().Get("reference_count_pass");
+      pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
+      pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
+      pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+      graph = pass->Apply(std::move(graph));
+      // Publish the collectors on the graph under the shared attribute name.
+      graph->SetNotOwned(details::kGarbageCollector, &gcs_);
+    }
+  }
 #else
   std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
       main_program, member_->places_, loss_var_name, params,
@@ -310,6 +334,11 @@ void ParallelExecutor::BCastParamsToDevices(
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
   platform::RecordBlock b(0);
+#ifdef PADDLE_WITH_CUDA
+  if (!gcs_.empty()) {
+    ResetReferenceCount();
+  }
+#endif
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
@@ -367,3 +396,6 @@ USE_PASS(graph_viz_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+#ifdef PADDLE_WITH_CUDA
+USE_PASS(reference_count_pass);
+#endif
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 5fb748fa205d5e9dbd2943b615c69aedd0e7a26f..88e2078454024c3a4d437615d3e6b15ee0c7d6a1 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <paddle/fluid/framework/details/build_strategy.h>
+#include <atomic>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/details/execution_strategy.h"
@@ -27,6 +29,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/reference_count_pass.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -70,6 +76,23 @@ class ParallelExecutor {
 
  private:
   ParallelExecutorPrivate *member_;
+
+#ifdef PADDLE_WITH_CUDA
+  // ref_cnts_ is initialized once, when the ParallelExecutor is constructed,
+  // and stays unchanged afterwards.
+  // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_.
+  details::DeviceReferenceCountMap ref_cnts_;
+  details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
+  details::DeviceGarbageCollectorMap gcs_;
+
+  void ResetReferenceCount() {  // restore cur_ref_cnts_ from ref_cnts_
+    for (auto &dev_entry : ref_cnts_) {  // keyed by device id
+      for (auto &var_entry : *dev_entry.second) {  // (key, initial count)
+        (*cur_ref_cnts_[dev_entry.first])[var_entry.first] = var_entry.second;
+      }
+    }
+  }
+#endif
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 2be655b89a4caf2bf9874dcab6bc0bdb2856a026..1a727a2c8c759d010606d5b605823b7252b35c69 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -31,9 +31,21 @@ DEFINE_bool(
     "Delete local scope eagerly. It will reduce GPU memory usage but "
     "slow down the destruction of variables.(around 1% performance harm)");
 
+DEFINE_double(
+    eager_delete_tensor_gb, -1.0,
+    "Memory size threshold (GB) when the garbage collector clear tensors. "
+    "Disabled when this value is less than 0");
+
 namespace paddle {
 namespace framework {
 
+int64_t GetEagerDeletionThreshold() {
+  // Negative flag => -1 (disabled); otherwise convert GB to bytes.
+  if (FLAGS_eager_delete_tensor_gb < 0) return -1;
+  const int64_t bytes_per_gb = static_cast<int64_t>(1) << 30;
+  return static_cast<int64_t>(FLAGS_eager_delete_tensor_gb * bytes_per_gb);
+}
+
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index b6165a595d537c314a95685e8b1edbc42e387ab7..e42fff1d79d92fb7ed61768a614d8cd98f6775a0 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -26,6 +26,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+int64_t GetEagerDeletionThreshold();
+
 class Scope;
 
 /**
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 4cf95fa0ae07823289fbf337062190f05e6c6bcf..f1d268548578fea12082e2edb213a3749eccbfaf 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -151,6 +151,8 @@ class Tensor {
 
   void set_layout(const DataLayout layout) { layout_ = layout; }
 
+  void clear() { holder_ = nullptr; }  // drops holder_ only
+
  private:
   /**
    * @note    Placeholder hides type T, so it doesn't appear as a template
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 6dc39cae0522efd48c2e2921611adebd6937ddf7..8a8aeb5e09a0d9a6746f6d6d61c547363e0e2d30 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -69,8 +69,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
     if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
       auto trt_teller = [&](const Node* node) {
         std::unordered_set<std::string> teller_set(
-            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
-             "depthwise_conv2d", "batch_norm", "concat"});
+            {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+             "depthwise_conv2d", "batch_norm", "concat", "tanh",
+             "elementwise_add", "dropout"});
         if (!node->IsFunction()) return false;
 
         const auto* func = static_cast<const Function*>(node);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index abee375313850f1490bacec11f737706c061a5e9..d9d6e139b8735c8f07c52f63c70b6b9805e03642 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -153,11 +153,21 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
 }  // namespace paddle
 
 USE_TRT_CONVERTER(elementwise_add_weight);
+USE_TRT_CONVERTER(elementwise_add_tensor);
+USE_TRT_CONVERTER(elementwise_sub_tensor);
+USE_TRT_CONVERTER(elementwise_div_tensor);
+USE_TRT_CONVERTER(elementwise_mul_tensor);
+USE_TRT_CONVERTER(elementwise_max_tensor);
+USE_TRT_CONVERTER(elementwise_min_tensor);
+USE_TRT_CONVERTER(elementwise_pow_tensor);
 USE_TRT_CONVERTER(mul);
 USE_TRT_CONVERTER(conv2d);
 USE_TRT_CONVERTER(relu);
+USE_TRT_CONVERTER(sigmoid);
+USE_TRT_CONVERTER(tanh);
 USE_TRT_CONVERTER(fc);
 USE_TRT_CONVERTER(pool2d);
 USE_TRT_CONVERTER(softmax);
 USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
+USE_TRT_CONVERTER(dropout);
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 9d7be2d03cf7bb12afe7e52d9630f184d689dc25..fac1babf6ec6131f84d3e3b9fc6efedd9f9f6cfc 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
   DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -24,6 +24,8 @@ nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
 nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
-
 nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
+
+nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 8168cdff1b85fc05d22fbec7fac6ab8892f3a907..e73c5bbf57501e4ff3c080a46d91685035652bfa 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -19,23 +19,31 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-class ReluOpConverter : public OpConverter {
+class ActivationOpConverter : public OpConverter {
  public:
-  ReluOpConverter() {}
+  ActivationOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
-    LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
-                 "type is Relu";
+    LOG(INFO)
+        << "convert a fluid Activation op to tensorrt activation layer whose "
+           "type is "
+        << op_type_;
     const nvinfer1::ITensor* input_tensor =
         engine_->GetITensor(op_desc.Input("X")[0]);
+
+    auto op_pair = ops.find(op_type_);
+    if (op_pair == ops.end()) {
+      PADDLE_THROW("Wrong activation op type!");
+    }
+
     nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
-        nvinfer1::ActivationType::kRELU);
+        op_pair->second);
     auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("relu (Output: " + output_name + ")").c_str());
+    layer->setName((op_type_ + " (Output: " + output_name + ")").c_str());
     layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
@@ -43,6 +51,32 @@ class ReluOpConverter : public OpConverter {
       engine_->DeclareOutput(output_name);
     }
   }
+
+ protected:
+  std::string op_type_;
+  static const std::unordered_map<std::string, nvinfer1::ActivationType> ops;
+};
+
+const std::unordered_map<std::string, nvinfer1::ActivationType>
+    ActivationOpConverter::ops = {  // fluid op type -> TensorRT activation
+        {"relu", nvinfer1::ActivationType::kRELU},
+        {"sigmoid", nvinfer1::ActivationType::kSIGMOID},
+        {"tanh", nvinfer1::ActivationType::kTANH},
+};
+
+class ReluOpConverter : public ActivationOpConverter {
+ public:
+  ReluOpConverter() { op_type_ = "relu"; }  // key looked up in ops
+};
+
+class SigmoidOpConverter : public ActivationOpConverter {
+ public:
+  SigmoidOpConverter() { op_type_ = "sigmoid"; }  // key looked up in ops
+};
+
+class TanhOpConverter : public ActivationOpConverter {
+ public:
+  TanhOpConverter() { op_type_ = "tanh"; }  // key looked up in ops
 };
 
 }  // namespace tensorrt
@@ -50,3 +84,5 @@ class ReluOpConverter : public OpConverter {
 }  // namespace paddle
 
 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
+REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
+REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9533ecbcfda4e2500fd201d8efc64fc5bd97169a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * DropoutOp: lowered to a TRT Scale layer with factor (1 - dropout_prob).
+ */
+class DropoutOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    float dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
+    auto output_name = op_desc.Output("Out")[0];
+
+    // One-element CPU tensor holding the uniform scale factor.
+    std::unique_ptr<framework::LoDTensor> weight_tensor(
+        new framework::LoDTensor());
+    weight_tensor->Resize(framework::make_ddim({1}));
+    auto* weight_data =
+        weight_tensor->mutable_data<float>(platform::CPUPlace());
+    weight_data[0] = 1 - dropout_prob;
+
+    TensorRTEngine::Weight scale_weights{
+        nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
+        weight_tensor->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *const_cast<nvinfer1::ITensor*>(input1),
+        nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(),
+        power_weights.get());
+
+    // The engine's weight_map takes ownership of the scale tensor.
+    engine_->weight_map[output_name + "_dropout"] = std::move(weight_tensor);
+    layer->setName(("dropout (Output: " + output_name + ")").c_str());
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(dropout);
+REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index e82762ea03ecd00bce7cfb83b130a3436ccbfed3..dd3dfb0bc7b609e28462954835a0d40e0a63b6cd 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -20,18 +20,18 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-TEST(ReluOpConverter, main) {
+void test_activation(std::string act_type) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
   TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
-  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("act-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("act-Out", nvinfer1::Dims2(10, 6));
 
   // Prepare Op description
   framework::OpDesc desc;
-  desc.SetType("relu");
-  desc.SetInput("X", {"relu-X"});
-  desc.SetOutput("Out", {"relu-Out"});
+  desc.SetType(act_type);
+  desc.SetInput("X", {"act-X"});
+  desc.SetOutput("Out", {"act-Out"});
 
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
@@ -40,8 +40,16 @@ TEST(ReluOpConverter, main) {
   validator.Execute(5);
 }
 
+TEST(ReluOpConverter, main) { test_activation("relu"); }
+
+TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); }
+
+TEST(TanhOpConverter, main) { test_activation("tanh"); }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
 
 USE_OP(relu);
+USE_OP(sigmoid);
+USE_OP(tanh);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b8e621b702d977f5868766a6eafb98c8522c3cd
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(DropoutOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(8, parameters, scope, 1000);
+
+  std::vector<int> tensor_shape{8, 10};
+  validator.DeclInputVar("dropout-X", tensor_shape,
+                         nvinfer1::DimsCHW(10, 1, 1));
+  validator.DeclOutputVar("dropout-Out", nvinfer1::DimsCHW(10, 1, 1));
+  validator.DeclOutputVar("mask-Out", nvinfer1::DimsCHW(10, 1, 1));
+
+  // Prepare a dropout op description with is_test = 1, dropout_prob = 0.4
+  framework::OpDesc desc;
+  int is_test = 1;
+  float dropout_prob = 0.4;
+
+  desc.SetType("dropout");
+  desc.SetInput("X", {"dropout-X"});
+  desc.SetOutput("Mask", {"mask-Out"});
+  desc.SetOutput("Out", {"dropout-Out"});
+  desc.SetAttr("is_test", is_test);
+  desc.SetAttr("dropout_prob", dropout_prob);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  std::unordered_set<std::string> neglected_output = {"mask-Out"};
+  // "mask-Out" is passed to Execute as a neglected output.
+  validator.Execute(8, neglected_output);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(dropout);
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index c2f45fdc99b87bc12c2aadf1985de6e98a24fce7..26ef27c3caafadb4801b0ae52133f6175655ce0a 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -167,6 +167,8 @@ void BuddyAllocator::Free(void* p) {
 }
 
 size_t BuddyAllocator::Used() { return total_used_; }
+size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
+size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
 
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index f0c83efc23ce39c4fc89296d672e1e55751851bf..3f86a51f0d0b8504bbc4b0477f123093b343e9cf 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -42,6 +42,8 @@ class BuddyAllocator {
   void* Alloc(size_t unaligned_size);
   void Free(void* ptr);
   size_t Used();
+  size_t GetMinChunkSize();
+  size_t GetMaxChunkSize();
 
  public:
   // Disable copy and assignment
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 7c800b3c164049244770ceb2070b177d8307e85e..283745e977533358ef52521b36e67f0ada950e61 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -119,8 +119,8 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
     LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
                  << place.device << ", available " << avail << " bytes";
     LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
     LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
     platform::SetDeviceId(cur_dev);
   }
diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc
index f7c55c215bacdafc99da5fcd0b750a058dfed21c..58e6512021203664573a0478dade052f92dd70bb 100644
--- a/paddle/fluid/operators/math/cpu_lstm_compute.cc
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
 http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,76 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/cpu_lstm_compute.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
-namespace math {
-
-// TODO(TJ): ugly workaround, clean me
-template <typename T>
-void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
-  // gates: W_ch, W_ih, W_fh, W_oh
-  vec_sigmoid<T, platform::jit::avx>(24, gates + 8, gates + 8);
-  vec_tanh<T, platform::jit::avx>(8, gates, gates);
-  const T *i = gates + 8, *f = gates + 16, *o = gates + 24;
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int d = 0; d < 8; ++d) {
-    // C_t = C_t-1 * fgated + cand_gated * igated
-    ct[d] = ct_1[d] * f[d] + gates[d] * i[d];
-    // H_t = act_cell(C_t) * ogated
-    T tmp = ct[d] * 2;
-    tmp = static_cast<T>(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
-    vec_exp<T>(1, &tmp, &tmp);
-    tmp = static_cast<T>(2) / (static_cast<T>(1) + tmp) - static_cast<T>(1);
-    ht[d] = tmp * o[d];
-  }
-}
-
-#ifdef __AVX__
-namespace detail {
-namespace forward {
-namespace avx {
-__m256 Sigmoid(const __m256 a);
-__m256 Tanh(const __m256 a);
-}  // namespace avx
-}  // namespace forward
-}  // namespace detail
-
-template <>
-void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
-                              float* ht) {
-  namespace act = detail::forward::avx;
-  // gates: W_ch, W_ih, W_fh, W_oh
-  __m256 c, i, f, o;
-  c = _mm256_loadu_ps(gates);
-  i = _mm256_loadu_ps(gates + 8);
-  f = _mm256_loadu_ps(gates + 16);
-  o = _mm256_loadu_ps(gates + 24);
-
-  /* C_t = C_t-1 * fgated + cand_gated * igated*/
-  c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
-  i = _mm256_loadu_ps(ct_1);
-  f = _mm256_mul_ps(i, act::Sigmoid(f));
-  f = _mm256_add_ps(c, f);
-  _mm256_storeu_ps(ct, f);
-
-  /* H_t = act_cell(C_t) * ogated */
-  o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
-  _mm256_storeu_ps(ht, o);
-}
-#endif
-
-template void lstm_compute_ctht<float>(float* gates, const float* ct_1,
-                                       float* ct, float* ht);
-template void lstm_compute_ctht<double>(double* gates, const double* ct_1,
-                                        double* ct, double* ht);
-
-}  // namespace math
+namespace math {}  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h
index 244164f08c4bb70833a9bfc884982a4225945bf0..28b6f71729edf1b8cc5d610d76af78dea213313e 100644
--- a/paddle/fluid/operators/math/cpu_lstm_compute.h
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
 http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,6 +11,11 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
 
 namespace paddle {
 namespace operators {
@@ -21,7 +23,58 @@ namespace math {
 
 // TODO(TJ): ugly workaround, clean me
 template <typename T>
-void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht);
+void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
+  // gates: W_ch, W_ih, W_fh, W_oh (8 values each), activated in place below
+  vec_sigmoid<T, platform::jit::avx>(24, gates + 8, gates + 8);
+  vec_tanh<T, platform::jit::avx>(8, gates, gates);
+  const T *i = gates + 8, *f = gates + 16, *o = gates + 24;
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int d = 0; d < 8; ++d) {
+    // C_t = C_t-1 * fgated + cand_gated * igated
+    ct[d] = ct_1[d] * f[d] + gates[d] * i[d];
+    // H_t = act_cell(C_t) * ogated, with tanh(x) = 2 / (1 + exp(-2x)) - 1
+    T tmp = ct[d] * 2;
+    tmp = static_cast<T>(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
+    vec_exp<T>(1, &tmp, &tmp);
+    tmp = static_cast<T>(2) / (static_cast<T>(1) + tmp) - static_cast<T>(1);
+    ht[d] = tmp * o[d];
+  }
+}
+
+#ifdef __AVX__
+namespace detail {
+namespace forward {
+namespace avx {
+__m256 Sigmoid(const __m256 a);
+__m256 Tanh(const __m256 a);
+}  // namespace avx
+}  // namespace forward
+}  // namespace detail
+
+// AVX specialization: processes one 8-float LSTM step with 256-bit vectors.
+// `inline` is required: a full specialization defined in a header is an
+// ordinary function, so without it every including TU emits a definition.
+template <>
+inline void lstm_compute_ctht<float>(float* gates, const float* ct_1,
+                                     float* ct, float* ht) {
+  namespace act = detail::forward::avx;
+  // gates: W_ch, W_ih, W_fh, W_oh (8 floats each)
+  __m256 c = _mm256_loadu_ps(gates);
+  __m256 i = _mm256_loadu_ps(gates + 8);
+  __m256 f = _mm256_loadu_ps(gates + 16);
+  __m256 o = _mm256_loadu_ps(gates + 24);
+
+  /* C_t = C_t-1 * fgated + cand_gated * igated */
+  c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
+  f = _mm256_mul_ps(_mm256_loadu_ps(ct_1), act::Sigmoid(f));
+  f = _mm256_add_ps(c, f);
+  _mm256_storeu_ps(ct, f);
+  /* H_t = act_cell(C_t) * ogated */
+  o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
+  _mm256_storeu_ps(ht, o);
+}
+#endif
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index 65a3bc928e47ac60f06e7efc75f42703e45acbb4..791138a8c0eb3c477942a8b723206a8f8a3eac77 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -63,7 +63,7 @@ class WhileOp : public framework::OperatorBase {
     while (cond.data<bool>()[0]) {
       auto &current_scope = scope.NewScope();
       step_scopes->push_back(&current_scope);
-      executor.RunPreparedContext(ctx.get(), &current_scope, false);
+      executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
       if (is_test) {
         scope.DeleteScope(&current_scope);
       }
@@ -169,7 +169,8 @@ class WhileGradOp : public framework::OperatorBase {
           }
         }
       }
-      executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false);
+      executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true,
+                                  true);
 
       auto &pg_names = Outputs(kXGRAD);
       auto &p_names = Inputs(kX);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index e25efebe6c3555958f4f75e2b87b7dc45d4a4177..5af8af640e43a5b2e5ee9856f09f66a9fdf4463c 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -51,7 +51,7 @@ ENDIF()
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index c6f1d1f3d544117311821d980300dffea03891a5..dfc079e986e93c7f02f17b299e5d6293edbedd05 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -210,11 +210,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   if (dynload::HasCUDNN()) {
     cudnn_holder_.reset(new CudnnHolder(&stream_, place));
   }
+
+  callback_manager_.reset(new StreamCallbackManager(stream_));
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
   Wait();
+  WaitStreamCallback();
   PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
   eigen_stream_.reset();
   eigen_device_.reset();
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 3ed49fc4233d4c0cd6cc16319eda08480ab9b434..79539195157d74d4d757edee5e008cbb76c93ee2 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -31,6 +31,9 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/stream_callback_manager.h"
+#endif
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
@@ -112,6 +115,17 @@ class CUDADeviceContext : public DeviceContext {
     PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
   }
 
+  template <typename Callback>  // Callback: nullary callable
+  void AddStreamCallback(Callback&& callback) const {
+    std::lock_guard<std::mutex> guard(callback_mtx_);  // serialize manager use
+    callback_manager_->AddCallback(callback);
+  }
+
+  void WaitStreamCallback() const {
+    std::lock_guard<std::mutex> guard(callback_mtx_);
+    callback_manager_->Wait();  // resets the manager's internal thread pool
+  }
+
  private:
   CUDAPlace place_;
 
@@ -125,7 +139,12 @@ class CUDADeviceContext : public DeviceContext {
   int multi_process;
   int max_threads_per_mp;
 
-  std::mutex mtx_;
+  mutable std::mutex mtx_;
+
+  // This lock is used only for stream callbacks.
+  // Reusing mtx_ for StreamCallbackManager can occasionally cause a deadlock.
+  mutable std::mutex callback_mtx_;
+  std::unique_ptr<StreamCallbackManager> callback_manager_;
 };
 
 template <>
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c984065aa5fa1a8875aebe84051ab396bc417ec
--- /dev/null
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <functional>
+#include <memory>
+#include "ThreadPool.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+using StreamCallback = std::function<void(cudaStream_t, cudaError_t)>;
+
+class StreamCallbackManager;
+
+struct StreamCallbackContext {  // heap-allocated per registered callback
+  template <typename Callback>
+  inline StreamCallbackContext(const StreamCallbackManager *manager,
+                               Callback &&callback)
+      : manager_(manager), callback_(callback) {}
+
+  const StreamCallbackManager *manager_;  // do not own
+  StreamCallback callback_;  // invoked with the stream and its status
+};
+
+class StreamCallbackManager {  // runs host callbacks queued on a CUDA stream
+ public:
+  explicit inline StreamCallbackManager(cudaStream_t stream = nullptr)
+      : stream_(stream), thread_pool_(new ThreadPool(1)) {}
+
+  template <typename Callback>
+  inline void AddCallback(Callback &&callback) const {
+    AddCallbackWithStreamAndErrorInfo(
+        [=](cudaStream_t, cudaError_t) { callback(); });
+  }
+
+  template <typename Callback>
+  inline void AddCallbackWithStreamAndErrorInfo(Callback &&callback) const {
+    std::unique_ptr<StreamCallbackContext> context(  // freed if enforce throws
+        new StreamCallbackContext(this, callback));
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc, context.get(), 0));
+    context.release();  // ownership passes to StreamCallbackFunc's task
+  }
+
+  // Swaps in a fresh pool; the old pool's destructor presumably drains tasks.
+  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
+
+ private:
+  const cudaStream_t stream_;
+  mutable std::unique_ptr<ThreadPool> thread_pool_;
+
+  // A CUDA stream callback must not call the CUDA API itself, so the user
+  // callback is handed to thread_pool_ instead of running inline.
+  static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
+                                           cudaError_t status,
+                                           void *user_data) {
+    auto *context = static_cast<StreamCallbackContext *>(user_data);
+    context->manager_->thread_pool_->enqueue([=]() {
+      std::unique_ptr<StreamCallbackContext> owner(context);
+      owner->callback_(stream, status);
+    });
+  }
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 05b06d3677ce53752cc169cb93b89b408a81bde4..2822d2d4201119aee259345b92bd8815deb7007a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -716,6 +716,12 @@ function main() {
         build_mac
         run_mac_test
         ;;
+      cicheck_py35)
+        cmake_gen ${PYTHON_ABI:-""}
+        build
+        run_test
+        assert_api_not_changed
+        ;;
       *)
         print_usage
         exit 0
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
index 22d867beea25c97efcbcb6f61ca2b7a7777f9c5c..8051acb8812e201baba605cfa1cb3f74741e0d5a 100644
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -67,7 +67,7 @@ def get_word_dict():
         for field in movie_reviews.fileids(category):
             for words in movie_reviews.words(field):
                 word_freq_dict[words] += 1
-    words_sort_list = six.iteritems(word_freq_dict)
+    words_sort_list = list(six.iteritems(word_freq_dict))
     words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
     for index, word in enumerate(words_sort_list):
         words_freq_sorted.append((word[0], index))
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 9aac3c7fc16ae1ded2700662764895385b043130..1ca2ac2ddc7daef3f4c0ea2004a62258ae4610ac 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -122,7 +122,7 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic'
+        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index 08579c7dd62ea6aea87b053345211914a6be6237..82e704169e4b828549dab4c47b0fa46d9afd8f7e 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -109,15 +109,20 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
     return t
 
 
+from paddle.fluid.transpiler.details import op_to_code
+
+
 def operator_equal(a, b):
+    if op_to_code(a) != op_to_code(b):
+        raise ValueError("In operator_equal not equal\n")
+
     for k, v in six.iteritems(a.__dict__):
         if isinstance(v, fluid.framework.Program) or \
                 isinstance(v, fluid.framework.Block):
             continue
 
         elif isinstance(v, core.OpDesc):
-            if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
-                raise ValueError("In operator_equal not equal:{0}\n".format(k))
+            continue
 
         elif isinstance(v, collections.OrderedDict):
             v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0])
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
index b75373cf24a7344bf59b3c6fcb9c4c3969be6503..43fd9d425bffb1e0198f4e845da959570a964990 100644
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,8 +30,10 @@ class TestWhileOp(unittest.TestCase):
             "d1", shape=[10], append_batch_size=False, dtype='float32')
         d2 = layers.data(
             "d2", shape=[10], append_batch_size=False, dtype='float32')
+
         i = layers.zeros(shape=[1], dtype='int64')
         i.stop_gradient = True
+
         init = layers.zeros(shape=[10], dtype='float32')
         mem_array = layers.array_write(x=init, i=i)
         data_array = layers.array_write(x=d0, i=i)
@@ -45,11 +47,19 @@ class TestWhileOp(unittest.TestCase):
         i = layers.zeros(shape=[1], dtype='int64')
         i.stop_gradient = True
 
-        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
         array_len.stop_gradient = True
         cond = layers.less_than(x=i, y=array_len)
 
+        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
+        j.stop_gradient = True
+
+        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len2.stop_gradient = True
+        cond2 = layers.less_than(x=j, y=array_len2)
+
         while_op = layers.While(cond=cond)
+        while_op2 = layers.While(cond=cond2)
         with while_op.block():
             d = layers.array_read(array=data_array, i=i)
             prev = layers.array_read(array=mem_array, i=i)
@@ -59,7 +69,16 @@ class TestWhileOp(unittest.TestCase):
             layers.array_write(result, i=i, array=mem_array)
             layers.less_than(x=i, y=array_len, cond=cond)
 
-        sum_result = layers.array_read(array=mem_array, i=i)
+            with while_op2.block():
+                d2 = layers.array_read(array=data_array, i=j)
+                prev2 = layers.array_read(array=mem_array, i=j)
+                result2 = layers.sums(input=[d2, prev2])
+
+                j = layers.increment(x=j, in_place=True)
+                layers.array_write(result2, i=j, array=mem_array)
+                layers.less_than(x=j, y=array_len2, cond=cond2)
+
+        sum_result = layers.array_read(array=mem_array, i=j)
         loss = layers.mean(sum_result)
 
         append_backward(loss)
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index a83aa0f11eed9bfc1674d8d75dcfacc297f056b0..200175cfe87e24a53e1e229e41d1ff2a25fd66ec 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -113,27 +113,32 @@ def op_to_code(op):
             inputs_str += ", "
     inputs_str += "}"
 
+    attr_names = sorted(op.attr_names)
     attrs_str = ""
-    for i in range(0, len(op.attr_names)):
-        name = op.attr_names[i]
+    for i in range(0, len(attr_names)):
+        name = attr_names[i]
 
         attr_type = op.desc.attr_type(name)
         if attr_type == core.AttrType.BLOCK:
             a = "{name} = block[{value}]".format(
                 name=name, type=attr_type, value=op.block_attr_id(name))
             attrs_str += a
+            if i != len(attr_names) - 1:
+                attrs_str += ", "
             continue
 
         if attr_type == core.AttrType.BLOCKS:
             a = "{name} = blocks{value}".format(
                 name=name, type=attr_type, value=op.blocks_attr_ids(name))
             attrs_str += a
+            if i != len(attr_names) - 1:
+                attrs_str += ", "
             continue
 
         a = "{name} = {value}".format(
             name=name, type=attr_type, value=op.desc.attr(name))
         attrs_str += a
-        if i != len(op.attr_names) - 1:
+        if i != len(attr_names) - 1:
             attrs_str += ", "
 
     if outputs_str != "{}":