Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_fuse_var_op_handle

8149b0a9 · yuyang18 · a584bc86 · 50104f18 · 8149b0a9 · 8149b0a9
81 changed file
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -29,9 +29,11 @@ Currently supported `--model` argument include:
    You can choose to use GPU/CPU training. With GPU training, you can specify
    `--gpus <gpu_num>` to run multi GPU training.
 * Run distributed training with parameter servers:
+    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
    * start parameter servers:
        ```bash
        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        sleep 15
        ```
    * start trainers:
        ```bash

--- a/benchmark/fluid/run_fluid_benchmark.sh
+++ b/benchmark/fluid/run_fluid_benchmark.sh
+#!/bin/bash
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+sleep 15
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto glog lod_rank_table feed_fetch_method)
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,6 +7,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
@@ -29,6 +30,9 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
@@ -37,5 +41,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
        device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context gather_op_handle)
+cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase {
  void RunImpl() override;
 private:
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,8 @@
 #pragma once
+#include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -29,6 +31,8 @@ struct BuildStrategy {
  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+  std::string debug_graphviz_path_{""};
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -22,6 +22,7 @@ struct ExecutionStrategy {
  size_t num_threads_{0};
  bool use_event_{true};
  bool allow_op_delay_{false};
+  size_t num_iteration_per_drop_scope_{100};
 };
 }  //  namespace details

--- a/paddle/fluid/framework/details/graph_builder_factory.cc
+++ b/paddle/fluid/framework/details/graph_builder_factory.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/graph_builder_factory.h
+++ b/paddle/fluid/framework/details/graph_builder_factory.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {}
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+  std::unique_ptr<SSAGraphBuilder> Create();
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -30,10 +30,6 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
-DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
-              "the ssa graph path only print with GLOG_v=10,"
-              "default /tmp/graph.dot");
 namespace paddle {
 namespace framework {
 namespace details {
@@ -277,11 +273,6 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   */
  AddOutputToLeafOps(&result);
-  if (VLOG_IS_ON(10)) {
-    std::ofstream fout(FLAGS_ssa_graph_path);
-    PrintGraphviz(*graph, fout);
-  }
  return std::unique_ptr<SSAGraph>(graph);
 }

--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -41,8 +41,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
  void RunImpl() override;
 private:
-  const std::vector<Scope *> local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> places_;
+  std::vector<platform::Place> places_;
  const platform::NCCLContextMap &nccl_ctxs_;
 };

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -32,8 +32,8 @@ namespace framework {
 namespace details {
 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;

--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+namespace paddle {
+namespace framework {
+namespace details {
+ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+    std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
+    : strategy_(std::move(strategy)),
+      underlying_executor_(std::move(underlying_executor)),
+      local_scopes_(std::move(local_scopes)),
+      var_infos_(std::move(var_infos)),
+      places_(std::move(places)) {}
+FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  if (drop_scope_counter_ == 0) {
+    // Create local scopes.
+    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+      auto &scope = *it;
+      Scope &local_scope = scope->NewScope();
+      *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+          &local_scope;
+      for (auto &info : var_infos_) {
+        if (scope->FindVar(info.name_) != nullptr) {
+          continue;
+        }
+        if (info.persistable_) {  // Persistable
+          InitializeVariable(scope->Var(info.name_), info.type_);
+        } else {
+          InitializeVariable(local_scope.Var(info.name_), info.type_);
+        }
+      }
+    }
+  }
+  auto fetch_data = underlying_executor_->Run(fetch_tensors);
+  drop_scope_counter_ += 1;
+  if (!fetch_tensors.empty() ||
+      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+    for (auto &scope : local_scopes_) {
+      auto &local_scope =
+          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+      scope->DeleteScope(local_scope);
+    }
+  }
+  return fetch_data;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+struct VariableInfo {
+  std::string name_;
+  proto::VarType::Type type_;
+  bool persistable_;
+};
+class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ScopeBufferedSSAGraphExecutor(
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+      std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+ private:
+  size_t drop_scope_counter_{0};
+  ExecutionStrategy strategy_;
+  std::unique_ptr<SSAGraphExecutor> underlying_executor_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<VariableInfo> var_infos_;
+  std::vector<platform::Place> places_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
  op_handle->AddOutput(var);
 }
-template <typename Callback>
-void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-  for (auto &var : graph.dep_vars_) {
-    callback(*var);
-  }
-}
-void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
-  size_t var_id = 0;
-  std::unordered_map<const VarHandleBase *, size_t> vars;
-  sout << "digraph G {\n";
-  IterAllVar(graph, [&](const VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
-           << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-  sout << "}\n";
-}
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
  for (auto &op : graph->ops_) {
    if (!op->Outputs().empty()) {

--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -55,8 +55,6 @@ class SSAGraphBuilder {
                             const platform::Place &place, size_t place_offset);
  static void AddOutputToLeafOps(SSAGraph *graph);
-  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -17,10 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
-    : graph_(std::move(graph)) {}
 SSAGraphExecutor::~SSAGraphExecutor() {}
 }  // namespace details

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -28,15 +28,11 @@ class SSAGraphExecutor {
  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 public:
-  // Steal graph inside
+  SSAGraphExecutor() {}
-  explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
  virtual ~SSAGraphExecutor();
  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
- protected:
-  std::unique_ptr<SSAGraph> graph_;
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+namespace paddle {
+namespace framework {
+namespace details {
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+  sout << "digraph G {\n";
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <iosfwd>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+namespace paddle {
+namespace framework {
+namespace details {
+class SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    std::unique_ptr<SSAGraph> &&graph)
-    : SSAGraphExecutor(std::move(graph)),
+    : graph_(std::move(graph)),
      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                       : nullptr),
      local_scopes_(local_scopes),
@@ -189,7 +189,9 @@ void ThreadedSSAGraphExecutor::RunOp(
    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
  auto op_run = [ready_var_q, op, this] {
    try {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      if (VLOG_IS_ON(10)) {
+        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      }
      op->Run(strategy_.use_event_);
      VLOG(10) << op << " " << op->Name() << " Done ";
      running_ops_--;

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
             details::OpHandleBase *op);
 private:
+  std::unique_ptr<SSAGraph> graph_;
  std::unique_ptr<::ThreadPool> pool_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -22,7 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -42,8 +43,6 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
-  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
  bool own_local_scope;
 };
@@ -92,29 +91,34 @@ ParallelExecutor::ParallelExecutor(
      local_scopes.empty()) {  // Is CUDA
    BCastParamsToGPUs(bcast_vars);
  }
-// Startup Program has been run. All local scopes has correct parameters.
+  // Startup Program has been run. All local scopes has correct parameters.
-// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
+  // Step 2. Create vars in each scope;
-// ncclOp
+  std::vector<details::VariableInfo> var_infos;
-#ifdef PADDLE_WITH_CUDA
+  for (auto *var : main_program.Block(0).AllVars()) {
-  details::MultiDevSSAGraphBuilder builder(
+    var_infos.emplace_back();
+    var_infos.back().name_ = var->Name();
+    var_infos.back().type_ = var->GetType();
+    var_infos.back().persistable_ = var->Persistable();
+  }
+  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+  // ncclOp
+  details::SSAGraphBuilderFactory builder_factory(
      member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), build_strategy);
+      build_strategy);
-#else
+#ifdef PADDLE_WITH_CUDA
-  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
+  builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
-                                           params, member_->local_scopes_,
-                                           build_strategy);
 #endif
-  auto graph = builder.Build(main_program);
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+      exec_strategy, member_->local_scopes_, places,
+      builder_factory.Create()->Build(main_program)));
-  // Step 3. Create vars in each scope;
+  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
-  for (auto *var : main_program.Block(0).AllVars()) {
+      exec_strategy, member_->local_scopes_, std::move(var_infos),
-    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+      member_->places_, std::move(member_->executor_)));
-                                     var->Persistable());
-  }
 }
 void ParallelExecutor::BCastParamsToGPUs(
@@ -169,42 +173,9 @@ void ParallelExecutor::BCastParamsToGPUs(
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
  platform::RecordBlock b(0);
-  // Create local scopes.
-  for (auto it = member_->local_scopes_.rbegin();
-       it != member_->local_scopes_.rend(); ++it) {
-    auto &scope = *it;
-    Scope &local_scope = scope->NewScope();
-    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
-        &local_scope;
-    for (auto &name_type_pair : member_->var_types_) {
-      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
-        continue;
-      }
-      if (std::get<2>(name_type_pair)) {  // Persistable
-        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
-                           std::get<1>(name_type_pair));
-      } else {
-        InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
-                           std::get<1>(name_type_pair));
-      }
-    }
-  }
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
-  // Wait All computational streams
-  for (auto p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  for (auto &scope : member_->local_scopes_) {
-    auto &local_scope =
-        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
-    scope->DeleteScope(local_scope);
-  }
 }
 void ParallelExecutor::FeedTensorsIntoLocalScopes(

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -34,13 +34,7 @@ DEFINE_bool(
 namespace paddle {
 namespace framework {
-Scope::~Scope() {
+Scope::~Scope() { DropKids(); }
-  DropKids();
-  for (auto& kv : vars_) {
-    VLOG(3) << "Destroy variable " << kv.first;
-    delete kv.second;
-  }
-}
 Scope& Scope::NewScope() const {
  std::unique_lock<std::mutex> lock(mutex_);
@@ -49,10 +43,13 @@ Scope& Scope::NewScope() const {
 }
 Variable* Scope::Var(const std::string& name) {
+  // acquire the lock when new var under this scope
+  std::unique_lock<std::mutex> lock(mutex_);
  auto* v = FindVarLocally(name);
  if (v != nullptr) return v;
  v = new Variable();
-  vars_[name] = v;
+  vars_[name].reset(v);
  VLOG(3) << "Create variable " << name;
  v->name_ = &(vars_.find(name)->first);
  return v;
@@ -67,22 +64,29 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
+  // acquire the lock when find var
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindVarInternal(name);
+}
+Variable* Scope::FindVarInternal(const std::string& name) const {
  auto var = FindVarLocally(name);
  if (var != nullptr) {
    return var;
  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
 }
 const Scope* Scope::FindScope(const Variable* var) const {
  for (auto& kv : vars_) {
-    if (kv.second == var) {
+    if (kv.second.get() == var) {
      return this;
    }
  }
  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
 }
 void Scope::DropKids() {
+  std::unique_lock<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }
@@ -110,10 +114,10 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+  std::unique_lock<std::mutex> lock(mutex_);
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
-      delete it->second;
      it = vars_.erase(it);
    } else {
      ++it;
@@ -129,7 +133,7 @@ void Scope::Rename(const std::string& origin_name,
  auto new_it = vars_.find(new_name);
  PADDLE_ENFORCE(new_it == vars_.end(),
                 "The variable with name %s is already in the scope", new_name);
-  vars_[new_name] = origin_it->second;
+  vars_[new_name].reset(origin_it->second.release());
  vars_.erase(origin_it);
 }
@@ -141,7 +145,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
 Variable* Scope::FindVarLocally(const std::string& name) const {
  auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  if (it != vars_.end()) return it->second.get();
  return nullptr;
 }

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -47,15 +47,18 @@ class Scope {
  Scope& NewScope() const;
  /// Create a variable with given name if it doesn't exist.
+  /// Caller doesn't own the returned Variable.
  Variable* Var(const std::string& name);
  /// Create a variable with a scope-unique name.
+  /// Caller doesn't own the returned Variable.
  Variable* Var(std::string* name = nullptr);
  void EraseVars(const std::vector<std::string>& var_names);
  /// Find a variable in the scope or any of its ancestors.  Returns
  /// nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
  Variable* FindVar(const std::string& name) const;
  const Scope* parent() const { return parent_; }
@@ -78,13 +81,21 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;
-  Variable* FindVarLocally(const std::string& name) const;
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}
-  mutable std::unordered_map<std::string, Variable*> vars_;
+  // Called by FindVar recursively.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarInternal(const std::string& name) const;
+  // Called by FindVarInternal and Var.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarLocally(const std::string& name) const;
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};

--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 namespace paddle {
-namespace framework {}
+namespace framework {
+extern size_t SizeOfType(std::type_index type);
+void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
+}
+size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+}
+void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
+  int64_t size = numel() * SizeOfType(type);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
+    } else if (platform::is_gpu_place(place) ||
+               platform::is_cuda_pinned_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW(
+          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
+    }
+#else
+      if (platform::is_gpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+            boost::get<platform::CUDAPlace>(place), size, type));
+      } else if (platform::is_cuda_pinned_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+            boost::get<platform::CUDAPinnedPlace>(place), size, type));
+      }
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing.");
+  return mutable_data(place, holder_->type());
+}
+Tensor& Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size();
+  *this = src;
+  return *this;
+}
+Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+  check_memory_size();
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be lesser than the end row index.");
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    size_t base = numel() / dims_[0];
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.set_layout(layout_);
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.Resize(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+    return dst;
+  }
+}
+Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
+const DDim& Tensor::dims() const { return dims_; }
+int64_t Tensor::numel() const { return product(dims_); }
+}  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -54,26 +54,24 @@ class Tensor {
  /*! Return a pointer to mutable memory block. */
  template <typename T>
-  inline T* data();
+  T* data();
  /*! Return a pointer to constant memory block. */
  template <typename T>
-  inline const T* data() const;
+  const T* data() const;
-  inline bool IsInitialized() const;
+  bool IsInitialized() const;
-  inline void switch_place(platform::Place new_place);
  /**
   * @brief   Return a pointer to mutable memory block.
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place);
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type);
-  inline void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place);
  /**
   * @brief     Return a pointer to mutable memory block.
@@ -84,19 +82,19 @@ class Tensor {
   * @note      If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place);
  /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
+  const DDim& dims() const;
  /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;
  /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);
  /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);
  /**
   * @brief  Return a sub-tensor of the given tensor.
@@ -106,7 +104,7 @@ class Tensor {
   * @param[in] end_idx     The index of the end row(exclusive) to slice.
   *                        The index number begins from 0.
   */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;
  platform::Place place() const {
    PADDLE_ENFORCE_NOT_NULL(
@@ -123,11 +121,11 @@ class Tensor {
  // memory size returns the holding memory size in byte.
  size_t memory_size() const;
-  inline void check_memory_size() const;
+  void check_memory_size() const;
-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }
 private:
  /**
@@ -210,15 +208,6 @@ class Tensor {
  size_t offset_;
 };
-inline void Tensor::switch_place(platform::Place new_place) {
-  if (holder_->place() == new_place) {
-    return;
-  }
-  // TODO(tonyyang-svail): do memcpy here.
-  PADDLE_THROW("Not Implemented");
-}
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -20,21 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-extern size_t SizeOfType(std::type_index type);
-inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
 template <typename T>
 inline const T* Tensor::data() const {
  check_memory_size();
@@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-      if (platform::is_gpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-            boost::get<platform::CUDAPlace>(place), size, type));
-      } else if (platform::is_cuda_pinned_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-            boost::get<platform::CUDAPinnedPlace>(place), size, type));
-      }
-    }
-#endif
-    offset_ = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-}
-inline void* Tensor::mutable_data(platform::Place place) {
-  PADDLE_ENFORCE(this->holder_ != nullptr,
-                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
-}
-inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  *this = src;
-  return *this;
-}
-inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
-  PADDLE_ENFORCE_LT(
-      begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
-  if (dims_[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(layout_);
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-    return dst;
-  }
-}
-inline Tensor& Tensor::Resize(const DDim& dims) {
-  dims_ = dims;
-  return *this;
-}
-inline const DDim& Tensor::dims() const { return dims_; }
-inline int64_t Tensor::numel() const { return product(dims_); }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
  Tensor res;
  res.ShareDataWith(src);

--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -107,6 +109,13 @@ class OrderedRegistry {
  std::vector<std::unique_ptr<T>> data_;
 };
+template <typename T>
+T &GetFromScope(const framework::Scope &scope, const std::string &name) {
+  framework::Variable *var = scope.FindVar(name);
+  PADDLE_ENFORCE(var != nullptr);
+  return *var->GetMutable<T>();
+}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 # Add TRT tests
-nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine)
 # This test is not stable
 # See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 
 #nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
 #    DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
 #    SERIAL)
+nv_library(tensorrt_converter
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  DEPS tensorrt_engine mul_op)
+nv_test(test_op_converter SRCS test_op_converter.cc DEPS
+  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)

--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 namespace paddle {
@@ -36,8 +37,8 @@ class ReluOpConverter : public OpConverter {
  }
 };
-REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -22,14 +22,14 @@ class Conv2dOpConverter : public OpConverter {
 public:
  Conv2dOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
    LOG(INFO)
        << "convert a fluid conv2d op to tensorrt conv layer without bias";
  }
 };
-REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -56,7 +56,7 @@ void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
 class FcOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
    framework::OpDesc op_desc(op, nullptr);
@@ -106,14 +106,16 @@ class FcOpConverter : public OpConverter {
                                       n_output, weight.get(), bias.get());
    auto output_name = op_desc.Output("Out").front();
-    engine_->DeclareOutput(layer, 0, output_name);
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
  }
 };
-REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
 USE_OP(mul);
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -23,9 +23,8 @@ namespace tensorrt {
 */
 class MulOpConverter : public OpConverter {
 public:
-  MulOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
    framework::OpDesc op_desc(op, nullptr);
@@ -37,12 +36,18 @@ class MulOpConverter : public OpConverter {
        engine_, MatrixMultiply, *const_cast<nvinfer1::ITensor*>(input1), false,
        *const_cast<nvinfer1::ITensor*>(input2), false);
-    engine_->DeclareOutput(layer, 0, op_desc.Output("Out")[0]);
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
  }
 };
-REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+USE_OP(mul);
+REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -34,12 +35,15 @@ class OpConverter {
  // Converter logic for an op.
  virtual void operator()(const framework::proto::OpDesc& op,
-                          const framework::Scope& scope) {}
+                          const framework::Scope& scope,
+                          bool test_mode = false) {}
-  // Convert a single fluid operaotr and add the corresponding layer to TRT.
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  // test_mode: whether the instance executes in an unit test.
  void ConvertOp(const framework::proto::OpDesc& op,
                 const std::unordered_set<std::string>& parameters,
-                 const framework::Scope& scope, TensorRTEngine* engine) {
+                 const framework::Scope& scope, TensorRTEngine* engine,
+                 bool test_mode = false) {
    framework::OpDesc op_desc(op, nullptr);
    OpConverter* it{nullptr};
@@ -57,7 +61,7 @@ class OpConverter {
    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                            op_desc.Type());
    it->SetEngine(engine);
-    (*it)(op, scope);
+    (*it)(op, scope, test_mode);
  }
  // convert fluid block to tensorrt network
@@ -77,6 +81,9 @@ class OpConverter {
  // TensorRT engine
  TensorRTEngine* engine_{nullptr};
+ protected:
+  bool test_mode_;
 private:
  // registered op converter map, whose key is the fluid op type, and value is
  // the pointer position of corresponding OpConverter class.
@@ -85,13 +92,24 @@ class OpConverter {
  framework::Scope* scope_{nullptr};
 };
-#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)       \
+#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
-  struct trt_##op_type__##_converter {                          \
+  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
-    trt_##op_type__##_converter() {                             \
+    trt_##op_type__##_converter() {                                            \
-      Registry<OpConverter>::Register<Converter__>(#op_type__); \
+      ::paddle::inference::                                                    \
-    }                                                           \
+          Registry<paddle::inference::tensorrt::OpConverter>::Register<        \
-  };                                                            \
+              ::paddle::inference::tensorrt::Converter__>(#op_type__);         \
-  trt_##op_type__##_converter trt_##op_type__##_converter__;
+    }                                                                          \
+  };                                                                           \
+  trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
+  int TouchConverterRegister_##op_type__() {                                   \
+    trt_##op_type__##_converter__.Touch();                                     \
+    return 0;                                                                  \
+  }
+#define USE_TRT_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_##op_type__();                      \
+  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+      TouchConverterRegister_##op_type__();
 }  // namespace tensorrt
 }  // namespace inference

--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -36,3 +36,5 @@ TEST(OpConverter, ConvertBlock) {
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+USE_TRT_CONVERTER(conv2d)
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 namespace paddle {
 namespace inference {
@@ -104,8 +105,8 @@ class TRTConvertValidation {
  void SetOp(const framework::proto::OpDesc& desc) {
    op_ = framework::OpRegistry::CreateOp(desc);
-    OpConverter op_converter;
+    Singleton<OpConverter>::Global().ConvertOp(
-    op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
+        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
    engine_->FreezeNetwork();

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -43,9 +43,10 @@ void TensorRTEngine::Execute(int batch_size) {
 }
 TensorRTEngine::~TensorRTEngine() {
+  cudaStreamSynchronize(*stream_);
  // clean buffer
  for (auto& buf : buffers_) {
-    if (buf.buffer != nullptr) {
+    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
      buf.buffer = nullptr;
      buf.max_size = 0;
@@ -80,6 +81,8 @@ void TensorRTEngine::FreezeNetwork() {
    auto& buf = buffer(item.first);
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
+            << buf.buffer;
    buf.size = buf.max_size = item.second;
    buf.device = DeviceType::GPU;
  }
@@ -96,6 +99,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                        analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE(input->isNetworkInput());
  TensorRTEngine::SetITensor(name, input);
  return input;
 }
@@ -109,7 +113,9 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
+  PADDLE_ENFORCE(output->isNetworkOutput());
  // output buffers' size can only be decided latter, set zero here to mark this
  // and will reset latter.
  buffer_sizes_[name] = 0;
@@ -122,6 +128,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
  auto* output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  // output buffers' size can only be decided latter, set zero here to mark this
  // and will reset latter.

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 namespace paddle {
 namespace inference {
@@ -131,7 +132,11 @@ class TensorRTEngine : public EngineBase {
  // TensorRT related internal members
  template <typename T>
  struct Destroyer {
-    void operator()(T* x) { x->destroy(); }
+    void operator()(T* x) {
+      if (x) {
+        x->destroy();
+      }
+    }
  };
  template <typename T>
  using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
@@ -155,6 +160,27 @@ class TensorRTEngine : public EngineBase {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
  engine__->network()->add##layer__(ARGS);
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRT_EngineManager {
+ public:
+  TensorRTEngine* Create(int max_batch, int max_workspace,
+                         cudaStream_t* stream) {
+    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
+    return engines_.back().get();
+  }
+  void DeleteALl() {
+    for (auto& ptr : engines_) {
+      ptr.reset(nullptr);
+    }
+  }
+ private:
+  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+};
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -101,23 +101,22 @@ void SplitData(
 }
 void ThreadRunInfer(
-    const int tid, paddle::framework::Executor* executor,
+    const int tid, paddle::framework::Scope* scope,
-    paddle::framework::Scope* scope,
-    const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
-  auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+  // maybe framework:ProgramDesc is not thread-safe
-      new paddle::framework::ProgramDesc(*inference_program));
  auto& sub_scope = scope->NewScope();
+  auto place = paddle::platform::CPUPlace();
+  auto executor = paddle::framework::Executor(place);
+  auto inference_program =
+      paddle::inference::Load(&executor, scope, FLAGS_model_path);
-  std::string feed_holder_name = "feed_" + paddle::string::to_string(tid);
+  auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0);
-  std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid);
+  executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0);
-  copy_program->SetFeedHolderName(feed_holder_name);
-  copy_program->SetFetchHolderName(fetch_holder_name);
  const std::vector<std::string>& feed_target_names =
-      copy_program->GetFeedTargetNames();
+      inference_program->GetFeedTargetNames();
  const std::vector<std::string>& fetch_target_names =
-      copy_program->GetFetchTargetNames();
+      inference_program->GetFetchTargetNames();
  PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
@@ -131,9 +130,8 @@ void ThreadRunInfer(
  auto start_ms = GetCurrentMs();
  for (size_t i = 0; i < inputs.size(); ++i) {
    feed_targets[feed_target_names[0]] = inputs[i];
-    executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets,
+    executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets,
-                  true /*create_local_scope*/, true /*create_vars*/,
+                                &fetch_targets, false /*create_local_scope*/);
-                  feed_holder_name, fetch_holder_name);
  }
  auto stop_ms = GetCurrentMs();
  scope->DeleteScope(&sub_scope);
@@ -158,22 +156,10 @@ TEST(inference, nlp) {
  LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
  LOG(INFO) << "Total number of words: " << num_total_words;
-  const bool model_combined = false;
  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // 1. Define place, executor, scope
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
  std::unique_ptr<paddle::framework::Scope> scope(
      new paddle::framework::Scope());
-  // 2. Initialize the inference_program and load parameters
-  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  inference_program =
-      InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined);
-  if (FLAGS_use_mkldnn) {
-    EnableMKLDNN(inference_program);
-  }
 #ifdef PADDLE_WITH_MKLML
  // only use 1 thread number per std::thread
  omp_set_dynamic(0);
@@ -189,21 +175,30 @@ TEST(inference, nlp) {
    start_ms = GetCurrentMs();
    for (int i = 0; i < FLAGS_num_threads; ++i) {
      threads.emplace_back(
-          new std::thread(ThreadRunInfer, i, &executor, scope.get(),
+          new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs)));
-                          std::ref(inference_program), std::ref(jobs)));
    }
    for (int i = 0; i < FLAGS_num_threads; ++i) {
      threads[i]->join();
    }
    stop_ms = GetCurrentMs();
  } else {
-    if (FLAGS_prepare_vars) {
+    // 1. Define place, executor, scope
-      executor.CreateVariables(*inference_program, scope.get(), 0);
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+    // 2. Initialize the inference_program and load parameters
+    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
+                                    /*model combined*/ false);
+    if (FLAGS_use_mkldnn) {
+      EnableMKLDNN(inference_program);
    }
    // always prepare context
    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
    ctx = executor.Prepare(*inference_program, 0);
+    if (FLAGS_prepare_vars) {
+      executor.CreateVariables(*inference_program, scope.get(), 0);
+    }
    // preapre fetch
    const std::vector<std::string>& fetch_target_names =
        inference_program->GetFetchTargetNames();

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -227,6 +227,8 @@ op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
+      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
 else()
    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()

--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
 if(WITH_DISTRIBUTE)
  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      request_handler_impl.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
+      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
      selected_rows memory)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -25,29 +25,15 @@ namespace paddle {
 namespace operators {
 namespace detail {
-std::once_flag RPCClient::init_flag_;
+void GRPCClient::InitImpl() { InitEventLoop(); }
-std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+void GRPCClient::InitEventLoop() {
-RPCClient* RPCClient::GetInstance() {
-  std::call_once(init_flag_, &RPCClient::Init);
-  return rpc_client_.get();
-}
-void RPCClient::Init() {
-  if (rpc_client_.get() == nullptr) {
-    rpc_client_.reset(new RPCClient());
-  }
-  rpc_client_->InitEventLoop();
-}
-void RPCClient::InitEventLoop() {
  // start the client process thread
  // TODO(wuyi): can make this in a threadpool
-  client_thread_.reset(new std::thread(std::bind(&RPCClient::Proceed, this)));
+  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
-RPCClient::~RPCClient() {
+GRPCClient::~GRPCClient() {
  Wait();
  cq_.Shutdown();
  {
@@ -59,11 +45,10 @@ RPCClient::~RPCClient() {
  client_thread_->join();
 }
-bool RPCClient::AsyncSendVariable(const std::string& ep,
+bool GRPCClient::AsyncSendVar(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
+                              const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
+                              const framework::Scope& scope,
-                                  const std::string& var_name,
+                              const std::string& var_name, int64_t time_out) {
-                                  int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
@@ -113,11 +98,10 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
  result->Swap(&tmp);
 }
-bool RPCClient::AsyncGetVariable(const std::string& ep,
+bool GRPCClient::AsyncGetVar(const std::string& ep,
-                                 const platform::DeviceContext& ctx,
+                             const platform::DeviceContext& ctx,
-                                 const framework::Scope& scope,
+                             const framework::Scope& scope,
-                                 const std::string& var_name,
+                             const std::string& var_name, int64_t time_out) {
-                                 int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
@@ -155,12 +139,12 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
  return true;
 }
-bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
+bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
+                                  const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
+                                  const framework::Scope& scope,
-                                      const std::string& in_var_name,
+                                  const std::string& in_var_name,
-                                      const std::string& out_var_name,
+                                  const std::string& out_var_name,
-                                      int64_t time_out) {
+                                  int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string in_var_name_val = in_var_name;
@@ -198,7 +182,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
  return true;
 }
-void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
  const auto ch = GetChannel(ep);
  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
@@ -211,7 +196,8 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
  req_count_++;
 }
-void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
  const auto ch = GetChannel(ep);
  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
  s->Prepare(time_out);
@@ -223,12 +209,12 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
  req_count_++;
 }
-void RPCClient::Wait() {
+void GRPCClient::Wait() {
  std::unique_lock<std::mutex> lk(sync_mutex_);
  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
 }
-void RPCClient::Proceed() {
+void GRPCClient::Proceed() {
  void* tag = nullptr;
  bool ok = false;
@@ -251,7 +237,7 @@ void RPCClient::Proceed() {
  }
 }
-std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
+std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
  // TODO(Yancey1989): make grpc client completely thread-safe
  std::lock_guard<std::mutex> guard(chan_mutex_);
  auto it = channels_.find(ep);

--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -38,6 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
@@ -164,47 +165,46 @@ class FetchBarrierProcessor : public BaseProcessor {
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
-class RPCClient {
+class GRPCClient : public RPCClient {
 public:
-  RPCClient() {}
+  GRPCClient() {}
-  ~RPCClient();
+  virtual ~GRPCClient();
-  static RPCClient* GetInstance();
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = RPCClient::rpc_time_out) override;
-  bool AsyncSendVariable(const std::string& ep,
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
-                         const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
-                         const framework::Scope& scope,
+                   int64_t time_out = RPCClient::rpc_time_out) override;
-                         const std::string& var_name,
-                         int64_t time_out = 600 * 1000);
-  bool AsyncGetVariable(const std::string& ep,
+  bool AsyncPrefetchVar(const std::string& ep,
                        const platform::DeviceContext& ctx,
                        const framework::Scope& scope,
-                        const std::string& var_name,
+                        const std::string& in_var_name,
-                        int64_t time_out = 600 * 1000);
+                        const std::string& out_var_name,
+                        int64_t time_out = RPCClient::rpc_time_out) override;
-  bool AsyncPrefetchVariable(const std::string& ep,
+  void AsyncSendBatchBarrier(
-                             const platform::DeviceContext& ctx,
+      const std::string& ep,
-                             const framework::Scope& scope,
+      int64_t time_out = RPCClient::rpc_time_out) override;
-                             const std::string& in_var_name,
-                             const std::string& out_var_name,
-                             int64_t time_out = 600 * 1000);
-  void AsyncSendBatchBarrier(const std::string& ep,
+  void AsyncSendFetchBarrier(
-                             int64_t time_out = 600 * 1000);
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
-  void AsyncSendFetchBarrier(const std::string& ep,
+  void Wait() override;
-                             int64_t time_out = 600 * 1000);
-  void Wait();
+ protected:
+  void InitImpl() override;
+ private:
  // InitEventLoop should only be called by Init()
  void InitEventLoop();
- private:
  void Proceed();
  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
-  // Init is called by GetInstance.
-  static void Init();
 private:
  grpc::CompletionQueue cq_;
@@ -218,9 +218,7 @@ class RPCClient {
  // mutex for GetChannel thread safety
  std::mutex chan_mutex_;
-  static std::unique_ptr<RPCClient> rpc_client_;
+  DISABLE_COPY_AND_ASSIGN(GRPCClient);
-  static std::once_flag init_flag_;
-  DISABLE_COPY_AND_ASSIGN(RPCClient);
 };
 }  // namespace detail

--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/detail/grpc_client.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -123,7 +124,8 @@ TEST(PREFETCH, CPU) {
  std::thread server_thread(StartServer);
  g_rpc_service->WaitServerReady();
-  detail::RPCClient* client = detail::RPCClient::GetInstance();
+  detail::RPCClient* client =
+      detail::RPCClient::GetInstance<detail::GRPCClient>();
  int port = g_rpc_service->GetSelectedPort();
  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
@@ -137,7 +139,7 @@ TEST(PREFETCH, CPU) {
    std::string in_var_name("ids");
    std::string out_var_name("out");
-    client->AsyncPrefetchVariable(ep, ctx, scope, in_var_name, out_var_name);
+    client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
    client->Wait();
    auto var = scope.Var(out_var_name);
    auto value = var->GetMutable<framework::SelectedRows>()->value();

--- a/paddle/fluid/operators/detail/rpc_client.cc
+++ b/paddle/fluid/operators/detail/rpc_client.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/detail/rpc_client.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+std::once_flag RPCClient::init_flag_;
+std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/detail/rpc_client.h
+++ b/paddle/fluid/operators/detail/rpc_client.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+class RPCClient {
+ public:
+  virtual bool AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = rpc_time_out) = 0;
+  virtual bool AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = rpc_time_out) = 0;
+  virtual bool AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                int64_t time_out = rpc_time_out) = 0;
+  virtual void AsyncSendBatchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+  virtual void AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+  virtual void Wait() = 0;
+  static constexpr int64_t rpc_time_out = 120 * 1000;
+  template <typename T>
+  static RPCClient* GetInstance() {
+    std::call_once(init_flag_, &RPCClient::Init<T>);
+    return rpc_client_.get();
+  }
+  // Init is called by GetInstance.
+  template <typename T>
+  static void Init() {
+    if (rpc_client_.get() == nullptr) {
+      rpc_client_.reset(new T());
+      rpc_client_->InitImpl();
+    }
+  }
+ protected:
+  virtual void InitImpl() {}
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<RPCClient> rpc_client_;
+};
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
@@ -43,7 +44,8 @@ class FetchBarrierOp : public framework::OperatorBase {
    // For profiling
    platform::RecordEvent record_event(Type(), &ctx);
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    rpc_client->Wait();

--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -61,12 +61,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
    std::vector<std::string> endpoint_list =
        Attr<std::vector<std::string>>("endpoint_list");
-    detail::RPCClient client;
+    detail::RPCClient* client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    for (auto& ep : endpoint_list) {
      VLOG(3) << "sending nccl id to " << ep;
-      client.AsyncSendVariable(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
+      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
    }
-    client.Wait();
+    client->Wait();
    VLOG(3) << "sending completed...";
  }

--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -41,14 +41,14 @@ class PrefetchOp : public framework::OperatorBase {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                << outs[i] << " back";
-        rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
+        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
-                                          outs[i]);
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }

--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -44,11 +44,12 @@ class RecvOp : public framework::OperatorBase {
    // For profiling
    platform::RecordEvent record_event(Type(), &ctx);
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    for (size_t i = 0; i < outs.size(); i++) {
      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
    }
    if (sync_mode) {
      rpc_client->Wait();

--- a/paddle/fluid/operators/reverse_op.cc
+++ b/paddle/fluid/operators/reverse_op.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/reverse_op.h"
+#include <vector>
+namespace paddle {
+namespace operators {
+class ReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
+    for (int a : axis) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The axis must be less than input tensor's rank.");
+    }
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The LoDTensor to be flipped.");
+    AddOutput("Out", "The LoDTensor after flipping.");
+    AddAttr<std::vector<int>>(
+        "axis", "The axises that along which order of elements is reversed.");
+    AddComment(R"DOC(
+      Reverse Operator.
+      Reverse the order of elements in the input LoDTensor along given axises.
+      Case 1:
+        Given
+            X = [[1, 2, 3, 4, 5]
+                 [6, 7, 8, 9, 10]
+                 [11, 12, 13, 14, 15]],
+        and
+            axis = [0],
+        we get:
+            Out = [[11, 12, 13, 14, 15]
+                   [6, 7, 8, 9, 10]
+                   [1, 2, 3, 4, 5]].
+      Case 2:
+        Given
+            X = [[[1, 2, 3, 4]
+                  [5, 6, 7, 8]]
+                 [[9, 10, 11, 12]
+                  [13, 14, 15, 16]]],
+        and
+            axis = [0, 2],
+        we get:
+            Out = [[[12, 11, 10, 9]
+                    [16, 15, 14, 13]]
+                   [[4, 3, 2, 1]
+                    [8, 7, 6, 5]]],
+    )DOC");
+  }
+};
+class ReverseGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("reverse");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", GetAttr("axis"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
+                  ops::ReverseGradMaker);
+REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
+REGISTER_OP_CPU_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
--- a/paddle/fluid/operators/reverse_op.cu
+++ b/paddle/fluid/operators/reverse_op.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/reverse_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
--- a/paddle/fluid/operators/reverse_op.h
+++ b/paddle/fluid/operators/reverse_op.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T, int Rank>
+struct ReverseFunctor {
+  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
+                  framework::LoDTensor* out, const std::vector<int>& axis) {
+    Eigen::array<bool, Rank> reverse_axis;
+    for (int i = 0; i < Rank; ++i) {
+      reverse_axis[i] = false;
+    }
+    for (int a : axis) {
+      reverse_axis[a] = true;
+    }
+    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
+    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
+    auto* dev = context.eigen_device();
+    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
+  }
+};
+template <typename DeviceContext, typename T>
+class ReverseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::LoDTensor>("X");
+    auto* out = context.Output<framework::LoDTensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    const auto& axis = context.Attr<std::vector<int>>("axis");
+    int rank = x->dims().size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    switch (rank) {
+      case 1:
+        ReverseFunctor<DeviceContext, T, 1> functor1;
+        functor1(dev_ctx, *x, out, axis);
+        break;
+      case 2:
+        ReverseFunctor<DeviceContext, T, 2> functor2;
+        functor2(dev_ctx, *x, out, axis);
+        break;
+      case 3:
+        ReverseFunctor<DeviceContext, T, 3> functor3;
+        functor3(dev_ctx, *x, out, axis);
+        break;
+      case 4:
+        ReverseFunctor<DeviceContext, T, 4> functor4;
+        functor4(dev_ctx, *x, out, axis);
+        break;
+      case 5:
+        ReverseFunctor<DeviceContext, T, 5> functor5;
+        functor5(dev_ctx, *x, out, axis);
+        break;
+      case 6:
+        ReverseFunctor<DeviceContext, T, 6> functor6;
+        functor6(dev_ctx, *x, out, axis);
+        break;
+      default:
+        PADDLE_THROW(
+            "Reserve operator doesn't supports tensors whose ranks are greater "
+            "than 6.");
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -44,7 +44,8 @@ class SendBarrierOp : public framework::OperatorBase {
    // For profiling
    platform::RecordEvent record_event(Type(), &ctx);
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;

--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -49,12 +49,13 @@ class SendOp : public framework::OperatorBase {
    // For profiling
    platform::RecordEvent record_event(Type(), &ctx);
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }
@@ -72,7 +73,7 @@ class SendOp : public framework::OperatorBase {
    if (outs.size() > 0) {
      for (size_t i = 0; i < outs.size(); i++) {
        VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
-        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+        rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
      }
      rpc_client->Wait();
      // tell pservers that current trainer have called fetch

--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -45,14 +45,15 @@ class SendVarsOp : public framework::OperatorBase {
    // For profiling
    platform::RecordEvent record_event(Type(), &ctx);
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
        // TODO(Yancey1989): we need to use an IO threadpool which has
        // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -17,23 +17,93 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 namespace paddle {
 namespace operators {
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+namespace {
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+  switch (shape.size()) {
+    case 2:
+      return nvinfer1::Dims2(shape[0], shape[1]);
+    case 3:
+      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+    case 4:
+      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+    default:
+      return nvinfer1::Dims();
+  }
+  return nvinfer1::Dims();
+}
+}  // namespace
 template <typename DeviceContext, typename T>
 void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
    const framework::ExecutionContext &context) const {
+  VLOG(4) << "Prepare engine";
  // Get the ProgramDesc and pass to convert.
-  const auto &block = context.Attr<framework::proto::BlockDesc>("subgraph");
+  framework::proto::BlockDesc block_desc;
+  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
  max_batch_ = context.Attr<int>("max_batch");
  auto max_workspace = context.Attr<int>("max_workspace");
-  engine_.reset(new inference::tensorrt::TensorRTEngine(
+  engine_ = Singleton<TRT_EngineManager>::Global().Create(
-      max_batch_, max_workspace, nullptr));
+      max_batch_, max_workspace, &stream_);
+  engine_->InitNetwork();
+  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  // Add inputs
+  VLOG(4) << "declare inputs";
+  for (auto &input : context.Inputs("Xs")) {
+    VLOG(4) << "declare input " << input;
+    auto *var = block.FindVar(input);
+    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                      "TensorRT engine only takes LoDTensor as input");
+    auto shape = var->GetShape();
+    engine_->DeclareInput(
+        input, FluidDataType2TRT(
+                   var->Proto()->type().lod_tensor().tensor().data_type()),
+        Vec2TRT_Dims(var->GetShape()));
+  }
  // TODO(Superjomn) parameters should be passed after analysised from outside.
  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block, {}, context.scope(), engine_.get());
+      block_desc, {}, context.scope(), engine_);
+  // Add outputs
+  VLOG(4) << "declare outputs";
+  for (auto &output : context.Outputs("Ys")) {
+    VLOG(4) << "declare output " << output;
+    engine_->DeclareOutput(output);
+  }
  engine_->FreezeNetwork();
 }
@@ -42,7 +112,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("Xs", "A list of inputs.").AsDuplicable();
    AddOutput("Ys", "A list of outputs").AsDuplicable();
-    AddAttr<std::string>("subgraph", "the subgraph");
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<int>("max_batch", "the maximum batch size.");
+    AddAttr<int>("max_workspace", "the maximum batch size.");
    AddComment("TensorRT engine operator.");
  }
 };

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -32,9 +32,12 @@ class TensorRTEngineOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
+    auto input0 = ctx.Inputs("Xs").front();
    framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(
+        framework::ToDataType(ctx.scope()
-            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+                                  .FindVar(input0)
+                                  ->GetMutable<framework::LoDTensor>()
+                                  ->type()),
        platform::CPUPlace());
    return kt;
  }
@@ -50,17 +53,16 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
    // Try to determine a batch_size
-    auto* tensor0 = context.Input<framework::LoDTensor>(input_names.front());
+    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
-    PADDLE_ENFORCE_NOT_NULL(tensor0);
+        context.scope(), input_names.front());
-    int batch_size = tensor0->dims()[0];
+    int batch_size = tensor0.dims()[0];
    PADDLE_ENFORCE_LE(batch_size, max_batch_);
    // Convert input tensor from fluid to engine.
    for (const auto& x : context.Inputs("Xs")) {
      // convert input and copy to TRT engine's buffer
-      auto* v = context.scope().FindVar(x);
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
-      PADDLE_ENFORCE_NOT_NULL(v, "no variable called %s", x);
+          context.scope(), x);
-      auto& t = v->Get<framework::LoDTensor>();
      if (platform::is_cpu_place(t.place())) {
        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
                                 t.memory_size());
@@ -86,13 +88,18 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      fluid_t->Resize(framework::make_ddim(ddim));
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
      if (platform::is_cpu_place(fluid_t->place())) {
+        // TODO(Superjomn) change this float to dtype size.
        engine_->GetOutputInCPU(
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()), size);
+            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
+            size * sizeof(float));
      } else {
        engine_->GetOutputInGPU(
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()), size);
+            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+            size * sizeof(float));
      }
    }
+    cudaStreamSynchronize(stream_);
  }
 protected:
@@ -100,7 +107,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
  void Prepare(const framework::ExecutionContext& context) const;
 private:
-  mutable std::unique_ptr<inference::tensorrt::TensorRTEngine> engine_;
+  mutable cudaStream_t stream_;
+  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
  mutable int max_batch_{0};
 };

--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+USE_CPU_ONLY_OP(tensorrt_engine);
+namespace paddle {
+namespace operators {
+namespace {
+void CreateCPUTensor(framework::Scope* scope, const std::string& name,
+                     const std::vector<int64_t>& shape) {
+  auto* var = scope->Var(name);
+  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  auto dims = framework::make_ddim(shape);
+  tensor->Resize(dims);
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
+}
+void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
+                          const std::string& name,
+                          const std::vector<int64_t>& shape) {
+  using framework::proto::VarType;
+  auto* var = block->add_vars();
+  framework::VarDesc desc(name);
+  desc.SetType(VarType::LOD_TENSOR);
+  desc.SetDataType(VarType::FP32);
+  desc.SetShape(shape);
+  *var = *desc.Proto();
+}
+template <typename T>
+void SetAttr(framework::proto::OpDesc* op, const std::string& name,
+             const T& data);
+template <>
+void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
+                          const std::string& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRING);
+  attr->set_s(data);
+}
+template <>
+void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
+                  const int& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(data);
+}
+template <>
+void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
+                      const int64_t& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::LONG);
+  attr->set_l(data);
+}
+}  // namespace
+TEST(TensorRTEngineOp, manual) {
+  framework::ProgramDesc program;
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+  LOG(INFO) << "create mul op";
+  auto* mul = block_desc.AppendOp();
+  mul->SetType("mul");
+  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
+  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+  LOG(INFO) << "create fc op";
+  auto* fc = block_desc.AppendOp();
+  fc->SetType("mul");
+  fc->SetInput("X", std::vector<std::string>({"z"}));
+  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+  // Set inputs' variable shape in BlockDesc
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
+  AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
+  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
+  // It is wired, need to copy manually.
+  *block_->add_ops() = *mul->Proto();
+  *block_->add_ops() = *fc->Proto();
+  ASSERT_EQ(block_->ops_size(), 2);
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+  LOG(INFO) << "create engine op";
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // Prepare variables.
+  CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6}));
+  CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6}));
+  CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8}));
+  CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8}));
+  // Execute them.
+  LOG(INFO) << "engine_op run";
+  engine_op->Run(scope, place);
+}
+}  // namespace operators
+}  // namespace paddle
+USE_TRT_CONVERTER(mul)
+USE_TRT_CONVERTER(fc)
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
@@ -87,9 +87,10 @@ TEST(SendNcclId, GrpcServer) {
  int port = g_rpc_service->GetSelectedPort();
  std::string ep = string::Sprintf("127.0.0.1:%d", port);
-  detail::RPCClient* client = detail::RPCClient::GetInstance();
+  detail::RPCClient* client =
-  LOG(INFO) << "connect to server " << ep;
+      detail::RPCClient::GetInstance<detail::GRPCClient>();
-  client->AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME);
+  LOG(INFO) << "connect to server" << ep;
+  client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
  client->Wait();
  client->AsyncSendBatchBarrier(ep);
  client->Wait();

--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#if defined(__CUDA_ARCH__)
 #include <stdio.h>
 #define PADDLE_ASSERT(e)                                           \
  do {                                                             \
@@ -38,6 +38,9 @@ limitations under the License. */
  } while (0)
 #else
 #include <assert.h>
-#define PADDLE_ASSERT(e) assert(e)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
 #endif
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -81,6 +81,27 @@ enum class PoolingMode {
  kMaximumDeterministic,
 };
+#if CUDNN_VERSION < 6000
+#pragma message "CUDNN version under 6.0 is supported at best effort."
+#pragma message "We strongly encourage you to move to 6.0 and above."
+#pragma message "This message is intended to annoy you enough to update."
+#pragma message \
+    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#else
 inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
  switch (mode) {
    case PoolingMode::kMaximumDeterministic:
@@ -93,6 +114,7 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
      PADDLE_THROW("Unexpected pooling mode.");
  }
 }
+#endif  // CUDNN_VERSION < 6000
 template <typename T>
 class CudnnDataType;

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -175,7 +175,6 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 void CUDADeviceContext::Wait() const {
-  std::lock_guard<std::recursive_mutex> guard(mutex_);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  PADDLE_ENFORCE(cudaGetLastError());
 }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -100,7 +100,6 @@ class CUDADeviceContext : public DeviceContext {
  template <typename Callback>
  void RecordEvent(cudaEvent_t ev, Callback callback) {
-    std::lock_guard<std::recursive_mutex> guard(mutex_);
    callback();
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }
@@ -110,8 +109,6 @@ class CUDADeviceContext : public DeviceContext {
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-  mutable std::recursive_mutex mutex_;
  cudaStream_t stream_;
  cudnnHandle_t cudnn_handle_;
  cublasHandle_t cublas_handle_;

--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -45,7 +45,7 @@ extern void *cublas_dso_handle;
      std::call_once(cublas_dso_flag, []() {                                 \
        cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
      });                                                                    \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);                  \
+      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
      return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
    }                                                                        \
  };                                                                         \

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -39,7 +39,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
        cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
      });                                                                  \
      EnforceCUDNNLoaded(#__name);                                         \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                 \
+      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);          \
      return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
    }                                                                      \
  };                                                                       \

--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -45,7 +45,7 @@ extern void *cupti_dso_handle;
      std::call_once(cupti_dso_flag, []() {                                \
        cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
      });                                                                  \
-      void *p_##__name = dlsym(cupti_dso_handle, #__name);                 \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
    }                                                                      \
  };                                                                       \

--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -34,7 +34,7 @@ extern void *curand_dso_handle;
      std::call_once(curand_dso_flag, []() {                                 \
        curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
      });                                                                    \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);                  \
+      static void *p_##__name = dlsym(curand_dso_handle, #__name);           \
      return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
    }                                                                        \
  };                                                                         \

--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -37,7 +37,7 @@ extern void* nccl_dso_handle;
      std::call_once(nccl_dso_flag, []() {                               \
        nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
      });                                                                \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);                \
+      static void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
      return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
    }                                                                    \
  };                                                                     \

--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -40,7 +40,7 @@ extern void* tensorrt_dso_handle;
            paddle::platform::dynload::GetTensorRtDsoHandle();          \
        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
      });                                                               \
-      void* p_##__name = dlsym(tensorrt_dso_handle, #__name);           \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
    }                                                                   \

--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -40,7 +40,7 @@ extern void* warpctc_dso_handle;
      std::call_once(warpctc_dso_flag, []() {                                  \
        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
      });                                                                      \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);                    \
+      static void* p_##_name = dlsym(warpctc_dso_handle, #__name);             \
      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);                \
    }                                                                          \
  };                                                                           \

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -519,6 +519,14 @@ All parameter, weight, gradient are variables in Paddle.
          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
          [](ExecutionStrategy &self, bool allow_op_delay) {
            self.allow_op_delay_ = allow_op_delay;
+          })
+      .def_property(
+          "num_iteration_per_drop_scope",
+          [](const ExecutionStrategy &self) {
+            return self.num_iteration_per_drop_scope_;
+          },
+          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
+            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
          });
  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
@@ -545,6 +553,12 @@ All parameter, weight, gradient are variables in Paddle.
          [](BuildStrategy &self,
             BuildStrategy::GradientScaleStrategy strategy) {
            self.gradient_scale_ = strategy;
+          })
+      .def_property(
+          "debug_graphviz_path",
+          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
+          [](BuildStrategy &self, const std::string &path) {
+            self.debug_graphviz_path_ = path;
          });
  pe.def(py::init<const std::vector<platform::Place> &,

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -447,7 +447,7 @@ EOF
    # run paddle version to install python packages first
    RUN apt-get update &&\
        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
+        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1182,19 +1182,19 @@ def conv2d(input,
        - Input:
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
        - Output:
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
        Where
        .. math::
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
    Args:
       input(Variable): The input image with [N, C, H, W] format.

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -363,6 +363,40 @@ def zeros(shape, dtype, force_cpu=False):
    return fill_constant(value=0.0, **locals())
+def reverse(x, axis):
+    """
+    **reverse**
+    This function reverse the input 'x' along given axises.
+    Args:
+        x(Vairbale): the input to be reversed.
+        axis(int|tuple|list): Axis that along which order of elements 
+                    is reversed. If it is a tuple or a list, reversing 
+                    will be apply on each axis in the tuple or list.  
+    Returns:
+        Variable: The reversed tensor.
+    Examples:
+        .. code-block:: python
+          out = fluid.layers.reverse(x=in, axis=0)
+          # or:
+          out = fluid.layers.reverse(x=in, axis=[0,1])
+    """
+    if isinstance(axis, int):
+        axis = [axis]
+    helper = LayerHelper("reverse", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reverse',
+        inputs={'Input': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
 def save(x, file_path, overwrite=True):
    """
    Saves a variable as a file.

--- a/python/paddle/fluid/tests/test_concurrency.py
+++ b/python/paddle/fluid/tests/test_concurrency.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -43,12 +43,10 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+# TODO(wuyi): this test hungs on CI, will add it back later
+list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
-# FIXME(Yancey1989): this test would cost much more time on CUDAPlace
-# since load cudnn libraries, so we use a longer timeout to make this
-# unit test stability.
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30)
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
        self.axis = 1
+class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+    def init_axis(self):
+        self.axis = -1
+class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+    def init_axis(self):
+        self.axis = -1
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -22,8 +22,8 @@ class TestMulOp(OpTest):
    def setUp(self):
        self.op_type = "mul"
        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
+            'X': np.random.random((2, 5)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
+            'Y': np.random.random((5, 3)).astype("float32")
        }
        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
@@ -46,13 +46,16 @@ class TestMulOp2(OpTest):
    def setUp(self):
        self.op_type = "mul"
        self.inputs = {
-            'X': np.random.random((15, 4, 12, 10)).astype("float32"),
+            'X': np.random.random((3, 4, 4, 3)).astype("float32"),
-            'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
+            'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32")
        }
-        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
+        self.attrs = {
-        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
+            'x_num_col_dims': 2,
-                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
+            'y_num_col_dims': 2,
-        result = result.reshape(15, 4, 8, 2, 9)
+        }
+        result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3),
+                        self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
        self.outputs = {'Out': result}
    def test_check_output(self):
@@ -73,9 +76,9 @@ class TestMulOp2(OpTest):
 class TestFP16MulOp1(OpTest):
    def setUp(self):
        self.op_type = "mul"
-        x = np.random.random((32, 84)).astype("float16")
+        x = np.random.random((3, 5)).astype("float16")
-        y = np.random.random((84, 100)).astype("float16")
+        y = np.random.random((5, 4)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
        self.outputs = {'Out': np.dot(x, y)}
    def test_check_output(self):
@@ -88,13 +91,15 @@ class TestFP16MulOp1(OpTest):
 class TestFP16MulOp2(OpTest):
    def setUp(self):
        self.op_type = "mul"
-        x = np.random.random((15, 4, 12, 10)).astype("float16")
+        x = np.random.random((3, 4, 4, 3)).astype("float16")
-        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
+        y = np.random.random((2, 6, 1, 2, 3)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
-        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
+        self.attrs = {
-        result = np.dot(
+            'x_num_col_dims': 2,
-            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
+            'y_num_col_dims': 2,
-        result = result.reshape(15, 4, 8, 2, 9)
+        }
+        result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
        self.outputs = {'Out': result}
    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_reverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestReverseOp(OpTest):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0]
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "reverse"
+        self.inputs = {"X": self.x}
+        self.attrs = {'axis': self.axis}
+        out = self.x
+        for a in self.axis:
+            out = np.flip(out, axis=a)
+        self.outputs = {'Out': out}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+class TestCase0(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [1]
+class TestCase1(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0, 1]
+class TestCase2(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [0, 2]
+class TestCase3(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [1, 2]
+if __name__ == '__main__':
+    unittest.main()