From 735eba29760d8b6f58e0374401a78b64a76c3158 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sun, 24 Dec 2017 14:03:55 +0800
Subject: [PATCH] Feature/operator run place (#6783)

* "change operator interface"

* "move devicepool to device_context"

* "fix operator test"

* "fix op_registry Run interface"

* "net op passed. Need to fix nccl multi-Context"

* "add nccl group function"

* "add nccl group function"

* "fix gpu count exceed 32 error"

* "fix recurrent op, nccl op"

* "change the other operators interface with Place"

* "fix typo"

* "fix pybind"

* "fix device in python side"

* "fix pybind failed"

* "add init for test"

* "fix CI"
---
 doc/design/block.md                           |  4 +-
 paddle/framework/CMakeLists.txt               |  4 +-
 paddle/framework/executor.cc                  | 11 +--
 paddle/framework/executor.h                   | 92 +------------------
 paddle/framework/init.cc                      |  9 +-
 paddle/framework/init_test.cc                 |  4 +
 paddle/framework/op_registry_test.cc          | 18 ++--
 paddle/framework/operator.cc                  | 12 ++-
 paddle/framework/operator.h                   |  9 +-
 paddle/framework/operator_test.cc             | 39 ++++----
 paddle/operators/array_operator.h             |  8 +-
 paddle/operators/array_to_lod_tensor_op.cc    |  8 +-
 paddle/operators/assign_op.cc                 |  7 +-
 paddle/operators/beam_search_decode_op.cc     |  6 +-
 paddle/operators/beam_search_op.h             |  2 +-
 paddle/operators/cond_op.cc                   | 11 ++-
 paddle/operators/cond_op.h                    |  2 +-
 paddle/operators/conditional_block_op.cc      | 16 ++--
 paddle/operators/feed_op.cc                   |  9 +-
 paddle/operators/fetch_op.cc                  |  6 +-
 paddle/operators/fill_constant_op.cc          |  8 +-
 paddle/operators/fill_op.cc                   | 14 +--
 paddle/operators/increment_op.cc              |  2 +-
 paddle/operators/is_empty_op.cc               |  2 +-
 paddle/operators/load_op.cc                   | 10 +-
 paddle/operators/lod_array_length_op.cc       |  2 +-
 paddle/operators/lod_rank_table_op.cc         |  2 +-
 paddle/operators/lod_tensor_to_array_op.cc    |  7 +-
 paddle/operators/max_sequence_len_op.cc       |  2 +-
 paddle/operators/merge_lod_tensor_op.cc       |  6 +-
 paddle/operators/nccl_op.cc                   |  2 +-
 paddle/operators/nccl_op_test.cu.cc           | 24 +++--
 paddle/operators/net_op.h                     |  4 +-
 paddle/operators/net_op_test.cc               |  3 +-
 paddle/operators/recurrent_op.cc              | 38 +++++---
 paddle/operators/recv_op.cc                   |  8 +-
 .../reorder_lod_tensor_by_rank_op.cc          | 23 ++---
 paddle/operators/rnn_memory_helper_op.cc      |  6 +-
 paddle/operators/save_load_op_test.cc         |  6 +-
 paddle/operators/save_op.cc                   |  8 +-
 paddle/operators/shrink_rnn_memory_op.cc      | 10 +-
 paddle/operators/split_lod_tensor_op.cc       |  6 +-
 .../operators/tensor_array_read_write_op.cc   | 19 ++--
 paddle/operators/while_op.cc                  | 13 +--
 paddle/platform/CMakeLists.txt                |  2 +-
 paddle/platform/device_context.cc             | 53 +++++++++++
 paddle/platform/device_context.h              | 55 ++++++++++-
 ...context_test.cc => device_context_test.cu} | 55 ++++++++++-
 paddle/platform/dynload/nccl.h                |  2 +
 paddle/platform/enforce.h                     |  1 +
 paddle/platform/nccl_test.cu                  | 25 ++++-
 paddle/platform/place.h                       |  2 +
 paddle/pybind/pybind.cc                       | 10 +-
 paddle/pybind/tensor_py.h                     |  7 +-
 paddle/testing/CMakeLists.txt                 |  3 +-
 paddle/testing/paddle_gtest_main.cc           |  5 +
 python/paddle/v2/fluid/__init__.py            |  5 +
 python/paddle/v2/fluid/executor.py            |  5 +-
 python/paddle/v2/fluid/tests/op_test.py       |  4 +-
 .../paddle/v2/fluid/tests/test_adagrad_op.py  |  3 +-
 .../v2/fluid/tests/test_batch_norm_op.py      |  5 +-
 .../fluid/tests/test_beam_search_decode_op.py |  3 +-
 .../v2/fluid/tests/test_beam_search_op.py     |  3 +-
 python/paddle/v2/fluid/tests/test_cond_op.py  |  3 +-
 .../v2/fluid/tests/test_gaussian_random_op.py |  1 -
 .../paddle/v2/fluid/tests/test_is_empty_op.py |  3 +-
 python/paddle/v2/fluid/tests/test_sgd_op.py   |  3 +-
 .../v2/fluid/tests/test_uniform_random_op.py  |  1 -
 68 files changed, 468 insertions(+), 293 deletions(-)
 rename paddle/platform/{device_context_test.cc => device_context_test.cu} (58%)

diff --git a/doc/design/block.md b/doc/design/block.md
index 4066122c0e8..fab7f2dc481 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -291,10 +291,10 @@ public:
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
     PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
     for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
     }
   }
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 206e298eb27..be9c01fb04f 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -59,5 +59,5 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
-cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
 cc_test(init_test SRCS init_test.cc DEPS init)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 14ae37ec49c..997773c1689 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,13 +33,7 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
 
-DeviceContextPool* DeviceContextPool::pool = nullptr;
-
-Executor::Executor(const std::vector<platform::Place>& places) {
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto borrowed_contexts = pool.Borrow(places);
-  device_contexts_.swap(borrowed_contexts);
-}
+Executor::Executor(const platform::Place& place) : place_(place) {}
 
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
   if (var_type == proto::VarDesc::LOD_TENSOR) {
@@ -71,7 +65,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   //    - will change to use multiple blocks for RNN op and Cond Op
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
   auto& block = pdesc.Block(block_id);
-  auto& device = device_contexts_[0];
 
   Scope* local_scope = scope;
   if (create_vars) {
@@ -107,7 +100,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
     VLOG(3) << op->DebugString();
-    op->Run(*local_scope, *device);
+    op->Run(*local_scope, place_);
   }
   if (create_local_scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index a3d1609293a..d869e18901b 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -14,9 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <map>
-#include <unordered_map>
-
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
@@ -26,96 +23,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class DeviceContextPool {
- public:
-  static DeviceContextPool& Get() {
-    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
-    return *pool;
-  }
-
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
-    if (pool == nullptr) {
-      pool = new DeviceContextPool(places);
-    }
-    return *pool;
-  }
-
-  const platform::DeviceContext* Borrow(const platform::Place& place) {
-    auto range = device_contexts_.equal_range(place);
-    if (range.first == range.second) {
-      PADDLE_THROW(
-          "'Place' is not supported, Please re-compile with WITH_GPU "
-          "option");
-    }
-    return range.first->second;
-  }
-
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-    std::vector<const platform::DeviceContext*> borrowed_contexts;
-    for (auto& place : places) {
-      auto range = device_contexts_.equal_range(place);
-      if (range.first == range.second) {
-        PADDLE_THROW(
-            "'Place' is not supported, Please re-compile with WITH_GPU "
-            "option");
-      }
-      // TODO(dzhwinter) : assign the first found device. Will enhanced later.
-      // device load balancer maybe useful here.
-      borrowed_contexts.emplace_back(range.first->second);
-    }
-    return borrowed_contexts;
-  }
-
-  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    for (size_t i = 0; i < places.size(); i++) {
-      if (platform::is_cpu_place(places[i])) {
-        device_contexts_.emplace(
-            places[i], new platform::CPUDeviceContext(
-                           boost::get<platform::CPUPlace>(places[i])));
-      } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-        device_contexts_.emplace(
-            places[i], new platform::CUDADeviceContext(
-                           boost::get<platform::GPUPlace>(places[i])));
-#else
-        PADDLE_THROW(
-            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-            "option");
-#endif
-      }
-    }
-  }
-
-  ~DeviceContextPool() {}
-
- private:
-  static DeviceContextPool* pool;
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      return hash_(place.which());
-    }
-  };
-  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
-                          Hash>
-      device_contexts_;
-  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
-};
-
 class Executor {
  public:
   // TODO(dzhwinter) : Do not rely on this function, it will be removed
   explicit Executor(const platform::DeviceContext& device)
-      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
-
-  explicit Executor(const platform::Place& place)
-      : Executor(std::vector<platform::Place>({place})) {}
+      : Executor(device.GetPlace()) {}
 
-  explicit Executor(const std::vector<platform::Place>& places);
+  explicit Executor(const platform::Place& place);
 
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
@@ -128,7 +42,7 @@ class Executor {
            bool create_vars = true);
 
  private:
-  std::vector<const platform::DeviceContext*> device_contexts_;
+  const platform::Place place_;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index 1c4476f4b30..4deb4fa903d 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -14,8 +14,8 @@
 #include <algorithm>
 #include <string>
 
-#include "paddle/framework/executor.h"
 #include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
 #include "paddle/string/piece.h"
 
@@ -48,7 +48,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
   std::vector<platform::Place> places;
   for (auto &device : devices) {
     auto p = string::Piece(device);
-    if (string::Find(p, ':', 0) == string::Piece::npos) {
+    if (string::HasPrefix(p, "CPU")) {
       places.emplace_back(platform::CPUPlace());
     } else if (string::HasPrefix(p, "GPU")) {
 #ifdef PADDLE_WITH_CUDA
@@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) {
                      return platform::is_cpu_place(place);
                    }) == places.end()) {
     places.emplace_back(platform::CPUPlace());
-    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
   }
-  DeviceContextPool::Create(places);
-  return true;
+  platform::DeviceContextPool::Create(places);
   return true;
 }
 
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
index f65e881a761..cb1ba7ce8fd 100644
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -23,5 +23,9 @@ TEST(Init, InitDevices) {
 #ifdef PADDLE_WITH_CUDA
   std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
   ASSERT_EQ(InitDevices(ds2), true);
+
+  // test re-init
+  std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds3), true);
 #endif
 }
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 27713e5cbff..4cdf6e0865e 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -8,8 +8,7 @@ namespace framework {
 class CosineOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
 };
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) {
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
-  op->Run(scope, dev_ctx);
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
   float scale_get = op->Attr<float>("scale");
   ASSERT_EQ(scale_get, scale);
 }
@@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) {
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
-  op->Run(scope, dev_ctx);
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
   ASSERT_EQ(op->Attr<float>("scale"), 1.0);
 }
 
@@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_type(paddle::framework::proto::AttrType::INT);
   attr->set_i(4);
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
-  op->Run(scope, dev_ctx);
+  op->Run(scope, cpu_place);
   int test_attr = op->Attr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
 }
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 0e58c0b5707..5d38ef5bebc 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+
+#include "paddle/framework/executor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
 
@@ -388,11 +390,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
 };
 
 void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
+                             const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
-
-  ExecutionContext ctx(*this, scope, dev_ctx);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+  auto dev_ctx = pool.Borrow(place);
 
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
@@ -404,6 +406,8 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   // check if op[type] have kernel for kernel_key
   OpKernelMap& kernels = kernels_iter->second;
+
+  ExecutionContext ctx(*this, scope, *dev_ctx);
   auto kernel_key = GetKernelType(ctx);
   auto kernel_iter = kernels.find(kernel_key);
 
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 3207360cbac..ef750aff1ba 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -83,8 +83,7 @@ class OperatorBase {
   virtual std::string DebugString() const;
 
   /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const = 0;
+  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
 
   virtual bool IsNetOp() const { return false; }
 
@@ -159,8 +158,7 @@ class OperatorBase {
 class NOP : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
   std::unique_ptr<OperatorBase> Clone() const override {
     return std::unique_ptr<OperatorBase>(new NOP(*this));
   }
@@ -383,8 +381,7 @@ class OperatorWithKernel : public OperatorBase {
                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const final;
+  void Run(const Scope& scope, const platform::Place& place) const final;
 
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
   AllOpKernels() {
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 05a46515220..fbca45b59dc 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "paddle/framework/operator.h"
 #include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace framework {
@@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase {
   OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                       const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope, const platform::Place& place) const override {
     ++op_run_num;
     ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
     ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase {
   int x{0};
 };
 
-class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
-                                           OpAttrChecker* op_checker)
+  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
@@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name,
   }
 }
 
-REGISTER_OP_WITHOUT_GRADIENT(
-    test_operator, paddle::framework::OpWithoutKernelTest,
-    paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
+REGISTER_OP_WITHOUT_GRADIENT(test_operator,
+                             paddle::framework::OpWithoutKernelTest,
+                             paddle::framework::OpWithoutKernelCheckerMaker);
 
 TEST(OperatorBase, all) {
+  paddle::framework::InitDevices({"CPU"});
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -80,13 +80,13 @@ TEST(OperatorBase, all) {
   attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(3.14);
 
-  paddle::platform::CPUDeviceContext device_context;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   scope.Var("OUT1");
   ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->Run(scope, device_context);
+  op->Run(scope, cpu_place);
   ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
 
@@ -123,7 +123,6 @@ template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel<float> {
  public:
   void Compute(const ExecutionContext& ctx) const {
-    std::cout << "this is cpu kernel" << std::endl;
     std::cout << ctx.op().DebugString() << std::endl;
     cpu_kernel_run_num++;
     ASSERT_EQ(ctx.op().Input("x"), "IN1");
@@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
 
 // test with single input
 TEST(OpKernel, all) {
+  paddle::framework::InitDevices({"CPU"});
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_kernel");
   BuildVar("x", {"IN1"}, op_desc.add_inputs());
@@ -205,12 +205,12 @@ TEST(OpKernel, all) {
   attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(3.14);
 
-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }
 
@@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
   using namespace paddle::framework;
 
+  paddle::framework::InitDevices({"CPU"});
   proto::OpDesc op_desc;
+
   op_desc.set_type("op_multi_inputs_with_kernel");
   BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
   BuildVar("k", {"k0"}, op_desc.add_inputs());
@@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) {
   attr->set_type(paddle::framework::proto::AttrType::FLOAT);
   attr->set_f(3.14);
 
-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
   scope.Var("x0")->GetMutable<LoDTensor>();
   scope.Var("x1")->GetMutable<LoDTensor>();
@@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) {
   scope.Var("y1")->GetMutable<LoDTensor>();
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
 }
 
 class OperatorClone : public paddle::framework::OperatorBase {
@@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase {
                 const paddle::framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {}
+           const paddle::platform::Place& place) const override {}
 };
 
 TEST(Operator, Clone) {
+  paddle::framework::InitDevices({"CPU"});
   OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                   paddle::framework::VariableNameMap{},
                   paddle::framework::AttributeMap{});
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 1f2b4fdb4b4..d641918c56d 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -15,6 +15,7 @@
 #pragma once
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase {
 
  protected:
   size_t GetOffset(const framework::Scope &scope,
-                   const platform::DeviceContext &dev_ctx) const {
+                   const platform::Place &place) const {
     auto *i = scope.FindVar(Input("I"));
     PADDLE_ENFORCE(i != nullptr, "I must be set");
     auto &i_tensor = i->Get<framework::LoDTensor>();
     PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     size_t offset;
     if (platform::is_gpu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index b6ca3cad944..73796229bcf 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -12,10 +12,12 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 #include <numeric>
+
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
@@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
           continue;
         }
         auto slice = out->Slice(out_offset, out_offset + len);
+
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
+
         framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                             dev_ctx, &slice);
         out_offset += len;
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
index a914ff4ba92..60a913947ff 100644
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/var_type.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase {
            const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) {
       return;
@@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase {
     PADDLE_ENFORCE(
         out != nullptr,
         "The Output(Out) should not be null if the Input(X) is set.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
   }
 };
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
index 32756faac53..52c28e7f532 100644
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/beam_search_decode_op.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                      const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+    auto& dev_ctx = *pool.Borrow(dev_place);
+
     framework::ExecutionContext ctx(*this, scope, dev_ctx);
 
     const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h
index cc556bfe42a..08b551ef9bd 100644
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase {
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
     LOG(INFO) << "run beam search op";
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 8c860676e06..455fbd8ca3f 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/cond_op.h"
-
 #include "paddle/operators/gather.h"
 #include "paddle/operators/scatter.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
   }
 }
 
-void CondOp::Run(const Scope& scope,
-                 const platform::DeviceContext& dev_ctx) const {
+void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+  auto& dev_ctx = *pool.Borrow(place);
+
   PrepareDataForSubnet(scope, dev_ctx);
   std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
   for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+    sub_net_op_[i]->Run(*sub_scopes[i], place);
   }
   MergeDataFromSubnet(scope, dev_ctx);
 }
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 93121fb31be..7dcdc47e0b2 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase {
   }
 
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
+           const platform::Place& place) const override;
 
  private:
   const int TRUE_BRANCH = 0;
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
index 204be7d1e53..d8fd6420daf 100644
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp {
                      const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
     bool need_run = std::all_of(
         xs.begin(), xs.end(),
@@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp {
       scopes->front() = &scope.NewScope();
       auto &cur_scope = *scopes->front();
 
+      framework::Executor exec(dev_place);
       auto *block = Attr<framework::BlockDesc *>("sub_block");
-      framework::Executor exec(dev_ctx);
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
     }
   }
@@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
                          const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
     bool need_run = std::all_of(
         xs.begin(), xs.end(),
@@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
       framework::Scope &cur_scope = *scopes[0];
 
+      framework::Executor exec(dev_place);
       auto *block = Attr<framework::BlockDesc *>("sub_block");
-      framework::Executor exec(dev_ctx);
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
 
-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
                                   Outputs(framework::GradVarName("Params")));
 
-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
                                   Outputs(framework::GradVarName("X")));
     }
   }
 
  private:
   void AssignLocalGradientToGlobal(
-      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const platform::Place &place, const framework::Scope &cur_scope,
       const std::vector<std::string> &p_names,
       const std::vector<std::string> &pg_names) const {
     for (size_t i = 0; i < p_names.size(); ++i) {
@@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto assign = framework::OpRegistry::CreateOp(
           "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
           framework::AttributeMap{});
-      assign->Run(cur_scope, dev_ctx);
+      assign->Run(cur_scope, place);
       cur_scope.Rename(new_in_grad_name, in_grad_name);
     }
   }
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 66b8080c261..65c98a219b3 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
 
@@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase {
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
+    framework::CopyFrom(feed_item, place, dev_ctx, out_item);
     out_item->set_lod(feed_item.lod());
   }
 };
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 616590f2001..21c34512bf4 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
@@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
     dev_ctx.Wait();
     dst_item.set_lod(src_item.lod());
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index 3489079eaa3..fe0706c4a9d 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto data_type =
         static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
@@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase {
       auto cpu = platform::CPUPlace();
       out.mutable_data(cpu, framework::ToTypeIndex(data_type));
     } else {
-      out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
     }
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
     math::set_constant(dev_ctx, &out, value);
   }
 };
diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc
index f0c6cff8e34..9a2d8aafcae 100644
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &out =
         detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                 "Cannot find variable %s", Output("Out"))
@@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase {
     auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     platform::CPUPlace cpu;
     auto force_cpu = Attr<bool>("force_cpu");
-    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
-                     framework::ToTypeIndex(dtype));
+    out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));
 
     framework::LoDTensor tensor;
 
-    if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) {
+    if (force_cpu || platform::is_cpu_place(place)) {
       tensor.ShareDataWith(out);
     } else {
       // Always make tensor in CPU memory.
@@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase {
     framework::VisitDataType(
         dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
 
-    if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) {
+    if (!force_cpu && platform::is_gpu_place(place)) {
       // Copy tensor to out
-      framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out);
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+      framework::CopyFrom(tensor, place, dev_ctx, &out);
     }
   }
 };
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc
index 789c92102d6..3988ac12c74 100644
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc
index 3616a0414f9..545f87d4ed9 100644
--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
@@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     // get input
     auto *var = scope.FindVar(Input(kInput));
     PADDLE_ENFORCE_NOT_NULL(var);
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index 6c51dad27a4..ae6515bb12a 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -11,10 +11,10 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
+#include <fstream>
 
 #include "paddle/framework/op_registry.h"
-
-#include <fstream>
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase {
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
     framework::DeserializeFromStream(fin, tensor);
 
-    auto place = dev_ctx.GetPlace();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     if (platform::is_gpu_place(place)) {
       // copy CPU to GPU
       framework::LoDTensor cpu_tensor;
diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc
index cc8593810ba..d71cb028bca 100644
--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
@@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
index 2d67046bfee..c351ad8fef7 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -24,7 +24,7 @@ class LoDRankTableOp : public framework::OperatorBase {
                  const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
index 643f8859f3d..c7b9057f8da 100644
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
                           Input("X"))
                   .Get<framework::LoDTensor>();
@@ -86,6 +87,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
         auto slice = out[i].Slice(static_cast<int>(offset),
                                   static_cast<int>(offset + len));
+
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
+
         framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
                                     static_cast<int>(each_range.end)),
                             x.place(), dev_ctx, &slice);
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc
index dec2874a1fd..8d629fe7355 100644
--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
     auto *out =
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index 5edf29c3af9..2287f347910 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
+
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index e19f534f8a2..368d2bfaa10 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -24,7 +24,7 @@ class NCCLInitOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index c1046aadafb..b6e4ccb73f8 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/program_desc.h"
@@ -49,7 +50,7 @@ const f::DDim kDims = {100, 100};
 class NCCLTester : public ::testing::Test {
  public:
   virtual void SetUp() override {
-    cpu_ctx = new p::CPUDeviceContext(p::CPUPlace());
+    paddle::platform::CPUPlace cpu_place;
     for (size_t i = 0; i < gpu_list.size(); ++i) {
       p::GPUPlace place(i);
       dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
@@ -65,6 +66,7 @@ class NCCLTester : public ::testing::Test {
   }
 
   void NCCLInitOp() {
+    paddle::platform::CPUPlace cpu_place;
     std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
 
     op1->SetType("ncclInit");
@@ -76,7 +78,7 @@ class NCCLTester : public ::testing::Test {
 
     auto op = f::OpRegistry::CreateOp(*op1);
     VLOG(1) << "invoke NCCLInitOp.";
-    op->Run(g_scope, *cpu_ctx);
+    op->Run(g_scope, cpu_place);
     VLOG(1) << "NCCLInitOp finished.";
   }
 
@@ -111,13 +113,12 @@ class NCCLTester : public ::testing::Test {
     VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
     VLOG(1) << " send_tensor : " << send_tensor->numel()
             << " recv_tensor : " << recv_tensor->numel();
-    op->Run(*scope, *ctx);
+    op->Run(*scope, place);
     VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
   }
 
  public:
   std::vector<p::DeviceContext *> dev_ctxs;
-  p::DeviceContext *cpu_ctx;
   f::Scope g_scope;
   std::mutex mu;
 };
@@ -131,14 +132,14 @@ TEST(NCCL, ncclInitOp) {
   op_desc->SetAttr("gpus", {gpu_list});
 
   f::Scope g_scope;
-  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
+  paddle::platform::CPUPlace cpu_place;
 
   auto *var = g_scope.Var("x1");
   var->GetMutable<p::Communicator>();
 
   auto op = f::OpRegistry::CreateOp(*op_desc);
   VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx.get());
+  op->Run(g_scope, cpu_place);
   VLOG(1) << "NCCLInitOp finished.";
 }
 
@@ -294,9 +295,18 @@ int main(int argc, char **argv) {
     return 0;
   }
 
-  for (int i = 0; i < dev_count; ++i) {
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::GPUPlace(i));
     gpu_list.emplace_back(i);
   }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Create(places);
+
   testing::InitGoogleTest(&argc, argv);
 
   // device context should be release before scope.
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
index 8935751f15c..85d0153b32c 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase {
    * will be used.
    */
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
     for (auto& op : ops_) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
     }
   }
 
diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc
index 22fba9568d0..dfd86546e83 100644
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope, const platform::Place& place) const override {
     ++run_cnt;
   }
 };
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 5981d5745d2..77f3a40b763 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase {
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
     VLOG(3) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
     auto *program = block->Program();
 
     for (size_t i = 0; i < seq_len; ++i) {
@@ -270,6 +271,10 @@ class RecurrentOp : public RecurrentBase {
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/);
 
+      // get device context from pool
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+
       // Copy inside::output -> outside::output
       //    outside::output[seq_offset: seq_offset + 1] = inside::output
       this->LinkTensorWithCallback(
@@ -278,14 +283,13 @@ class RecurrentOp : public RecurrentBase {
               framework::LoDTensor *dst_tensor) {
             if (i == 0) {  // create output tensor at begin
               dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
-              dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type());
+              dst_tensor->mutable_data(place, src_tensor.type());
             }
 
             auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
             // Explicit copy output since the local RNN scope can be destroyed
             // early.
-            framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx,
-                                &dst_out);
+            framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out);
           });
 
       scopes.Next();
@@ -311,15 +315,20 @@ class RecurrentGradOp : public RecurrentBase {
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
     auto *program = block->Program();
 
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     for (size_t step_id = 0; step_id < seq_len; ++step_id) {
       size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
       VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
@@ -366,8 +375,7 @@ class RecurrentGradOp : public RecurrentBase {
           auto *cur_grad_var = cur_scope.Var(cur_grad);
           auto cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
-          framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx,
-                              cur_grad_tensor);
+          framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor);
         }
       }
 
@@ -410,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase {
             auto zero_op = framework::OpRegistry::CreateOp(
                 "fill_constant", framework::VariableNameMap{},
                 {{"Out", {pg_names[param_id]}}}, attrs);
-            zero_op->Run(scope, dev_ctx);
+            zero_op->Run(scope, place);
           }
 
           auto new_inside_name = cur_scope.Rename(inside_grad_name);
@@ -419,7 +427,7 @@ class RecurrentGradOp : public RecurrentBase {
           auto sum_op = framework::OpRegistry::CreateOp(
               "sum", {{"X", {pg_names[param_id], new_inside_name}}},
               {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
-          sum_op->Run(cur_scope, dev_ctx);
+          sum_op->Run(cur_scope, place);
 
           cur_scope.Rename(new_inside_name, inside_grad_name);
         }
@@ -437,11 +445,11 @@ class RecurrentGradOp : public RecurrentBase {
             }
             if (step_id == 0) {  // alloc memory
               outside->Resize(PrependDims(seq_len, inside.dims()));
-              outside->mutable_data(dev_ctx.GetPlace(), inside.type());
+              outside->mutable_data(place, inside.type());
             }
 
             auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst);
+            framework::CopyFrom(inside, place, dev_ctx, &dst);
           });
       VLOG(5) << "Link outside gradient finished ";
 
@@ -453,8 +461,8 @@ class RecurrentGradOp : public RecurrentBase {
             [&](const framework::LoDTensor &inside,
                 framework::LoDTensor *outside) {
               outside->Resize(inside.dims());
-              outside->mutable_data(dev_ctx.GetPlace(), inside.type());
-              framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside);
+              outside->mutable_data(place, inside.type());
+              framework::CopyFrom(inside, place, dev_ctx, outside);
             });
         VLOG(5) << "Link initialize state gradient finished ";
       }
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index 4e91d1151eb..89196f27a39 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -73,7 +73,7 @@ class RecvOp : public framework::OperatorBase {
   }
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     // FIXME(typhoonzero): no new scopes for every run.
     framework::Scope &recv_scope = scope.NewScope();
     rpc_service_->SetScope(&recv_scope);
@@ -113,7 +113,9 @@ class RecvOp : public framework::OperatorBase {
         auto *var = recv_scope.Var(grad_var_name);
         auto *tensor = var->GetMutable<framework::LoDTensor>();
         // FIXME(typhoonzero): do not copy
-        framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
+        framework::CopyFrom(v.second, place, dev_ctx, tensor);
       }
       rpc_service_->Reset();
 
@@ -121,7 +123,7 @@ class RecvOp : public framework::OperatorBase {
       framework::proto::ProgramDesc program_desc;
       program_desc.ParseFromString(program_str);
       framework::ProgramDesc program(program_desc);
-      framework::Executor executor(dev_ctx);
+      framework::Executor executor(place);
       // Run sub graph to get optimized tensor
       try {
         executor.Run(program, &recv_scope, 0, /*global_block*/
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
index 5e3079ee0c9..09d3ccc356b 100644
--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -12,9 +12,10 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <paddle/framework/lod_rank_table.h>
+#include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -53,7 +54,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
                                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto &x =
         detail::Ref(scope.FindVar(Input("X")),
                     "Cannot find input lod tensor variable %s", Input("X"))
@@ -69,11 +70,11 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
 
     out.Resize(x.dims());
     out.mutable_data(x.place(), x.type());
-    this->process(dev_ctx, x, rank_table, &out);
+    this->process(place, x, rank_table, &out);
   }
 
  protected:
-  virtual void process(const platform::DeviceContext &dev_ctx,
+  virtual void process(const platform::Place &place,
                        const framework::LoDTensor &x,
                        const framework::LoDRankTable &rank_table,
                        framework::LoDTensor *out) const = 0;
@@ -104,7 +105,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
     return absolute_table;
   }
 
-  size_t CopyTensorAndLod(const platform::DeviceContext &dev_ctx,
+  size_t CopyTensorAndLod(const platform::Place &place,
                           const AbsoluteRankTableItem &item,
                           const framework::LoDTensor &x,
                           framework::LoDTensor *out, size_t out_offset) const {
@@ -130,6 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
     auto x_sliced = x.Slice(x_offset, x_offset + len);
     auto out_sliced = out->Slice(out_offset, out_offset + len);
 
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
     framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
     out_offset += len;
     return out_offset;
@@ -145,8 +148,7 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
       : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
 
  protected:
-  void process(const platform::DeviceContext &dev_ctx,
-               const framework::LoDTensor &x,
+  void process(const platform::Place &place, const framework::LoDTensor &x,
                const framework::LoDRankTable &rank_table,
                framework::LoDTensor *out) const override {
     auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
@@ -154,7 +156,7 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
     out->mutable_lod()->clear();
     for (auto &item : rank_table.items()) {
       PADDLE_ENFORCE_LT(item.index, absolute_table.size());
-      out_offset = CopyTensorAndLod(dev_ctx, absolute_table[item.index], x, out,
+      out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out,
                                     out_offset);
     }
   }
@@ -192,8 +194,7 @@ class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
       : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
 
  protected:
-  void process(const platform::DeviceContext &dev_ctx,
-               const framework::LoDTensor &x,
+  void process(const platform::Place &place, const framework::LoDTensor &x,
                const framework::LoDRankTable &rank_table,
                framework::LoDTensor *out) const override {
     auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
@@ -214,7 +215,7 @@ class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
     // Copy TensorAndLod
     size_t out_offset = 0;
     for (auto &offset : offsets) {
-      out_offset = this->CopyTensorAndLod(dev_ctx, absolute_table[offset.first],
+      out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first],
                                           x, out, out_offset);
     }
   }
diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc
index 795bdf3e51a..edd475ec397 100644
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                     const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto mem_var_name = Input("X");
     auto *mem_var = scope.FindVar(mem_var_name);
     PADDLE_ENFORCE(mem_var != nullptr,
@@ -77,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto out_grad_var_name = Input(framework::GradVarName("Out"));
     auto *out_grad_var = scope.FindVar(out_grad_var_name);
 
@@ -100,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 
       auto zero_op = framework::OpRegistry::CreateOp(
           "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs);
-      zero_op->Run(scope, dev_ctx);
+      zero_op->Run(scope, dev_place);
     } else {
       auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
       auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
index a57466a48d4..6606613d738 100644
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load);
 TEST(SaveLoadOp, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;
-  paddle::platform::CPUDeviceContext ctx(place);
+
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({10, 10});
@@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) {
 
   auto save_op = paddle::framework::OpRegistry::CreateOp(
       "save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, ctx);
+  save_op->Run(scope, place);
 
   auto load_var = scope.Var("out_var");
   auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
   auto load_op = paddle::framework::OpRegistry::CreateOp(
       "load", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, ctx);
+  load_op->Run(scope, place);
   int* actual = target->data<int>();
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     EXPECT_EQ(expect[i], actual[i]);
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index eae1146d6c6..f763b8d6bf8 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -21,6 +21,7 @@
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
@@ -88,6 +89,11 @@ class SaveOp : public framework::OperatorBase {
                    "SaveOp only support LoDTensor, %s has wrong type", iname);
 
     auto &tensor = var->Get<framework::LoDTensor>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     framework::SerializeToStream(fout, tensor, dev_ctx);
   }
 };
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index 48194a547bb..3ee6bd190d8 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp {
       : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x_var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
     auto &x_tensor = x_var->Get<framework::LoDTensor>();
-    size_t offset = this->GetOffset(scope, dev_ctx);
+    size_t offset = this->GetOffset(scope, place);
     auto *rank_table_var = scope.FindVar(Input("RankTable"));
     PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
     auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
@@ -93,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
       : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
     auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
     PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
@@ -105,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     dx_tensor.Resize(x_tensor.dims());
     dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
 
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
     if (dout_var == nullptr) {  // dx_tensor fill zero
       math::set_constant(dev_ctx, &dx_tensor, 0.0f);
     } else {
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
index 3542d8624fe..89826ca6ee9 100644
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto *out_true =
@@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
     auto &x_lod = x.lod();
     auto &mask_dim = mask.dims();
 
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
+
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
     if (platform::is_cpu_place(mask.place())) {
       cpu_mask->ShareDataWith(mask);
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 90cbc19d1b1..2ee9bf700c2 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp {
       : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) return;
     auto &x_tensor = x->Get<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, dev_ctx);
+    size_t offset = GetOffset(scope, place);
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
     if (offset >= out->size()) {
@@ -39,7 +39,11 @@ class WriteToArrayOp : public ArrayOp {
     }
     if (x_tensor.memory_size() > 0) {
       auto *out_tensor = &out->at(offset);
-      CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
+
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+
+      CopyFrom(x_tensor, place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_tensor.lod());
     } else {
       VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
@@ -119,17 +123,18 @@ class ReadFromArrayOp : public ArrayOp {
                   const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x != nullptr, "X must be set");
     auto &x_array = x->Get<framework::LoDTensorArray>();
     auto *out = scope.FindVar(Output("Out"));
     PADDLE_ENFORCE(out != nullptr, "Out must be set");
     auto *out_tensor = out->GetMutable<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, dev_ctx);
+    size_t offset = GetOffset(scope, place);
     if (offset < x_array.size()) {
-      framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
-                          out_tensor);
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+      framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_array[offset].lod());
     } else {
       VLOG(10) << "offset " << offset << " >= " << x_array.size();
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 324c8b98c48..11ee96faad5 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase {
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
 
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
     auto *program = block->Program();
 
     auto step_scopes =
@@ -97,8 +98,8 @@ class WhileGradOp : public framework::OperatorBase {
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
-    framework::Executor executor(dev_ctx);
+           const platform::Place &dev_place) const override {
+    framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
     auto *program = block->Program();
 
@@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase {
             auto zero_op = framework::OpRegistry::CreateOp(
                 "fill_constant", framework::VariableNameMap{},
                 {{"Out", {pg_names[param_id]}}}, attrs);
-            zero_op->Run(scope, dev_ctx);
+            zero_op->Run(scope, dev_place);
           }
         }
 
@@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase {
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {pg_names[param_id], new_inside_name}}},
             {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
-        sum_op->Run(cur_scope, dev_ctx);
+        sum_op->Run(cur_scope, dev_place);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
     }
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 88df28a9668..f0a0ea70a0a 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -25,7 +25,7 @@ ENDIF()
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
-nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
+nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index dacee74fff3..a28e9de716c 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -15,6 +15,59 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+DeviceContextPool* DeviceContextPool::pool = nullptr;
+
+const platform::DeviceContext* DeviceContextPool::Borrow(
+    const platform::Place& place) {
+  auto it = device_contexts_.find(place);
+  if (it == device_contexts_.end()) {
+    PADDLE_THROW(
+        "'Place' is not supported, Please re-compile with WITH_GPU "
+        "option");
+  }
+  return it->second;
+}
+
+std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
+    const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
+  std::vector<const platform::DeviceContext*> borrowed_contexts;
+  for (auto& place : places) {
+    auto it = device_contexts_.find(place);
+    if (it != device_contexts_.end()) {
+      borrowed_contexts.emplace_back(it->second);
+    } else {
+      PADDLE_THROW(
+          "'Place' is not supported, Please re-compile with WITH_GPU "
+          "option");
+    }
+  }
+  return borrowed_contexts;
+}
+
+DeviceContextPool::DeviceContextPool(
+    const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  for (size_t i = 0; i < places.size(); i++) {
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_.emplace(places[i],
+                               new platform::CPUDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+    } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_.emplace(places[i],
+                               new platform::CUDADeviceContext(
+                                   boost::get<platform::GPUPlace>(places[i])));
+#else
+      PADDLE_THROW(
+          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
+#endif
+    }
+  }
+}
+
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 6cc0508522a..1d46ce5c703 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -11,8 +11,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
+#include <memory>
+#include <unordered_map>
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/platform/dynload/cublas.h"
@@ -20,10 +20,13 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
-#include <memory>
+
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
+#include "glog/logging.h"
+
 namespace paddle {
 namespace platform {
 
@@ -105,5 +108,51 @@ class CUDNNDeviceContext : public CUDADeviceContext {
 
 #endif
 
+/*! \brief device context pool singleton */
+class DeviceContextPool {
+ public:
+  explicit DeviceContextPool(const std::vector<platform::Place>& places);
+
+  static DeviceContextPool& Get() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+
+  /*! \brief  Create should only called by Init function */
+  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+
+  /*! \brief  Return handle of single device context. */
+  const platform::DeviceContext* Borrow(const platform::Place& place);
+
+  /*! \brief  Return handle of multi-device context. */
+  std::vector<const platform::DeviceContext*> Borrow(
+      const std::vector<platform::Place>& places);
+
+  ~DeviceContextPool() {}
+
+ private:
+  static DeviceContextPool* pool;
+  struct Hash {
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      int pre_hash = place.which()
+                     << (sizeof(int) * 8 - NUM_PLACE_TYPE_LIMIT_IN_BIT);
+      if (platform::is_gpu_place(place)) {
+        pre_hash += boost::get<platform::GPUPlace>(place).GetDeviceId();
+      }
+      return hash_(pre_hash);
+    }
+  };
+  std::unordered_map<const platform::Place, const platform::DeviceContext*,
+                     Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cu
similarity index 58%
rename from paddle/platform/device_context_test.cc
rename to paddle/platform/device_context_test.cu
index 109c13a8812..f046c79e0a0 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cu
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+
+#include "glog/logging.h"
 
 TEST(Device, Init) {
   using paddle::platform::DeviceContext;
@@ -62,3 +64,54 @@ TEST(Device, CUDNNDeviceContext) {
     }
   }
 }
+
+TEST(Device, DeviceContextPool) {
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::Place;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::GPUPlace;
+
+  DeviceContextPool& pool = DeviceContextPool::Get();
+  auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Borrow(CPUPlace());
+  EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1);
+
+  std::vector<Place> gpu_places;
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    gpu_places.emplace_back(GPUPlace(i));
+  }
+  auto dev_ctxs = pool.Borrow(gpu_places);
+  for (size_t i = 0; i < dev_ctxs.size(); ++i) {
+    auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
+
+    // check same as GPUPlace(i)
+    GPUPlace place = boost::get<GPUPlace>(dev_ctx->GetPlace());
+    EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
+  }
+}
+
+int main(int argc, char** argv) {
+  int dev_count = paddle::platform::GetCUDADeviceCount();
+  if (dev_count <= 1) {
+    LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
+                    "device count is "
+                 << dev_count;
+    return 0;
+  }
+
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::GPUPlace(i));
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Create(places);
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index 11007c1031c..cb31e00b8e2 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -63,6 +63,8 @@ extern void LoadNCCLDSO();
   __macro(ncclAllReduce);               \
   __macro(ncclBcast);                   \
   __macro(ncclAllGather);               \
+  __macro(ncclGroupStart);              \
+  __macro(ncclGroupEnd);                \
   __macro(ncclReduce);                  \
   __macro(ncclGetErrorString);
 
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 5abd4d4a345..d1c7be0790b 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 
+#include "paddle/platform/macros.h"
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"
 
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index 94ab360a196..6750c8da7db 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -12,17 +12,19 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#include <thrust/device_vector.h>
+#include <memory>
+#include <vector>
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/dynload/nccl.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
 
-#include <thrust/device_vector.h>
-#include <memory>
-#include <vector>
-
 static int dev_count = 0;
 
 namespace paddle {
@@ -31,7 +33,8 @@ namespace platform {
 TEST(NCCL, init) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
-  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+
   for (int i = 0; i < dev_count; ++i) {
     dynload::ncclCommDestroy(comms[i]);
   }
@@ -131,6 +134,18 @@ int main(int argc, char** argv) {
         << dev_count;
     return 0;
   }
+
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::GPUPlace(i));
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Create(places);
+
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index ca98920d414..6bff2d4d9cd 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -60,12 +60,14 @@ struct IsGPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const MKLDNNPlace &) const { return false; }
   bool operator()(const GPUPlace &gpu) const { return true; }
+  bool operator()(const CUDNNPlace &) const { return true; }
 };
 
 struct IsMKLDNNPlace : public boost::static_visitor<bool> {
   bool operator()(const MKLDNNPlace &) const { return true; }
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const GPUPlace &) const { return false; }
+  bool operator()(const CUDNNPlace &) const { return false; }
 };
 
 // Define the max number of Place in bit length. i.e., the max number of places
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 2d7fe251416..de6b24f70d8 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -360,10 +360,10 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
-              const platform::DeviceContext &dev_ctx) {
-             self.Run(scope, dev_ctx);
-             dev_ctx.Wait();
-           })
+              const platform::CPUPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::GPUPlace &place) { self.Run(scope, place); })
       .def("type",
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
@@ -417,7 +417,7 @@ All parameter, weight, gradient are variables in Paddle.
            });
 
   py::class_<framework::Executor>(m, "Executor")
-      .def(py::init<std::vector<platform::Place> &>())
+      .def(py::init<const platform::Place &>())
       .def("run", &Executor::Run);
 
   m.def("unique_integer", UniqueIntegerGenerator);
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 268a0f2fa38..413fd9b046f 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -14,9 +14,9 @@
 
 #pragma once
 #include <string>
-#include "paddle/framework/executor.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        framework::DeviceContextPool &pool =
-            framework::DeviceContextPool::Get();
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
         auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
             pool.Borrow(tensor.place()));
 
@@ -138,7 +137,7 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
 
-  framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
   auto dev_ctx =
       static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 8132742749e..77f84cd43bd 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -6,7 +6,6 @@ if(WITH_TESTING)
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
   if(NOT MOBILE_INFERENCE)
-    add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
-    add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
   endif()
 endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index a491322b7e5..7ba1bf095ab 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cstring>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/init.h"
 #include "paddle/memory/memory.h"
 
 int main(int argc, char** argv) {
@@ -32,8 +34,11 @@ int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
   testing::InitGoogleTest(&argc, argv);
   paddle::memory::Used(paddle::platform::CPUPlace());
+  std::vector<std::string> devs = {"CPU"};
 #ifdef PADDLE_WITH_CUDA
   paddle::memory::Used(paddle::platform::GPUPlace(0));
+  devs.push_back("GPU:0");
 #endif
+  paddle::framework::InitDevices(devs);
   return RUN_ALL_TESTS();
 }
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 471255ef50a..051b9094aaf 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -42,5 +42,10 @@ def __read_gflags_from_env__():
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
 
+    if core.is_compile_gpu():
+        core.init_devices(["CPU", "GPU:0"])
+    else:
+        core.init_devices(["CPU"])
+
 
 __read_gflags_from_env__()
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 4b4a0820abb..cdd576294f4 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -47,13 +47,14 @@ class Executor(object):
             act_places.append(p)
 
         # TODO(dzhwinter) : consider that our fluid tests all written in 
-        # GPUPlace(gpu_id), this will be changed in next PR.
+        # GPUPlace(gpu_id), this will be changed in the future
         if core.is_compile_gpu():
             core.init_devices(["CPU", "GPU:0"])
         else:
             core.init_devices(["CPU"])
 
-        self.executor = core.Executor(act_places)
+        # TODO(dzhwinter) : only use the first place
+        self.executor = core.Executor(act_places[0])
         self.places = places
 
     def aslodtensor(self, data):
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index e83c4a06220..087283bfded 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -90,12 +90,10 @@ def get_numeric_gradient(scope,
     def product(dim):
         return reduce(lambda a, b: a * b, dim, 1)
 
-    ctx = core.DeviceContext.create(core.CPUPlace())
-
     def get_output():
         sum = []
         for output_name in output_names:
-            op.run(scope, ctx)
+            op.run(scope, core.CPUPlace())
             sum.append(
                 np.array(scope.find_var(output_name).get_tensor()).mean())
         return np.array(sum).mean()
diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
index 903e84c3288..1ff3932164b 100644
--- a/python/paddle/v2/fluid/tests/test_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase):
             LearningRate='LearningRate',
             epsilon=2.0)
 
-        ctx = core.DeviceContext.create(place)
-        adagrad_op.run(scope, ctx)
+        adagrad_op.run(scope, place)
 
         # get and compare moment result
         moment_result_array = np.array(moment)
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index a9c0b1cfd34..dfc047e1f0d 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -296,8 +296,7 @@ class TestBatchNormOp(OpTest):
                 momentum=momentum,
                 epsilon=epsilon)
 
-            ctx = core.DeviceContext.create(place)
-            batch_norm_op.run(scope, ctx)
+            batch_norm_op.run(scope, place)
 
             # check forward result
             self.__assert_close(y_tensor, y_out, "y_out")
@@ -320,7 +319,7 @@ class TestBatchNormOp(OpTest):
                 ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
                 place,
                 feed_dict={"y_out": y_grad})
-            batch_norm_op_grad.run(scope, ctx)
+            batch_norm_op_grad.run(scope, place)
 
             x_grad_tensor = create_or_get_tensor(scope,
                                                  grad_var_name("x_val"), None,
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
index 5fad7d8cce5..f329214dce4 100644
--- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             SentenceIds="sentence_ids",
             SentenceScores="sentence_scores")
 
-        ctx = core.DeviceContext.create(self.cpu_place)
-        beam_search_decode_op.run(self.scope, ctx)
+        beam_search_decode_op.run(self.scope, self.cpu_place)
 
         expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
         self.assertEqual(sentence_ids.lod(), expected_lod)
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py
index cc7c09bb59d..595f132fa85 100644
--- a/python/paddle/v2/fluid/tests/test_beam_search_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py
@@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data):
 class BeamSearchOpTester(unittest.TestCase):
     def setUp(self):
         self.scope = core.Scope()
-        self.ctx = core.DeviceContext.create(core.CPUPlace())
         self._create_ids()
         self._create_scores()
         self._create_pre_ids()
@@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase):
             level=0,
             beam_size=2,
             end_id=0, )
-        op.run(self.scope, self.ctx)
+        op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         print 'selected_ids', np.array(selected_ids)
         print 'lod', selected_ids.lod()
diff --git a/python/paddle/v2/fluid/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py
index 9d1df44b906..32e54084e48 100644
--- a/python/paddle/v2/fluid/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase):
         self.create_global_variables()
         self.create_cond_op()
         self.create_sub_net()
-        ctx = core.DeviceContext.create(core.CPUPlace())
-        self.condop.run(self.scope, ctx)
+        self.condop.run(self.scope, core.CPUPlace())
         return np.array(self.scope.find_var("Out").get_tensor())
 
     def create_global_variables(self):
diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index a9d943b8b7f..4afe0c6a6d3 100644
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -24,7 +24,6 @@ class TestGaussianRandomOp(unittest.TestCase):
 
     def gaussian_random_test(self, place):
 
-        context = core.DeviceContext.create(place)
         program = fluid.Program()
         block = program.global_block()
         vout = block.create_var(name="Out")
diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py
index ed6e3fe24f6..0a4dd0f4faf 100644
--- a/python/paddle/v2/fluid/tests/test_is_empty_op.py
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase):
 
     def one_case(self, input, target):
         op = Operator(type="is_empty", X=input, Out="out")
-        ctx = core.DeviceContext.create(core.CPUPlace())
-        op.run(self.scope, ctx)
+        op.run(self.scope, core.CPUPlace())
         out = self.scope.var("out").get_tensor()
         self.assertEqual(np.array(out)[0], target)
 
diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
index ca05a381f06..9c345792bee 100644
--- a/python/paddle/v2/fluid/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase):
             Grad='Grad',
             ParamOut='Param',
             LearningRate='LearningRate')
-        ctx = core.DeviceContext.create(place)
-        sgd_op.run(scope, ctx)
+        sgd_op.run(scope, place)
 
         # get and compare result
         result_array = np.array(param)
diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
index 00b4f196209..d6872c8ba35 100644
--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -26,7 +26,6 @@ class TestUniformRandomOp(unittest.TestCase):
             self.uniform_random_test(place=core.GPUPlace(0))
 
     def uniform_random_test(self, place):
-        context = core.DeviceContext.create(place)
         program = fluid.Program()
         block = program.global_block()
         vout = block.create_var(name="Out")
-- 
GitLab