diff --git a/doc/design/block.md b/doc/design/block.md
index 4066122c0e8dfa33776796c3d205ba5aec9e0f52..fab7f2dc481ae51aa982164dc5048d90fcdc2b0b 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -291,10 +291,10 @@ public:
   }
   void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
     PADDLE_ENFORCE(symbols_ready_,
                    "operators and variables should be created first.");
     for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
     }
   }
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 206e298eb27a2daaec5c674d45cfe4b81a6b522d..be9c01fb04f4428b5754c3d963b079ca347c45ee 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 
@@ -59,5 +59,5 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
-cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
 cc_test(init_test SRCS init_test.cc DEPS init)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 14ae37ec49c12203381e74b3f9174a460e41c18e..997773c1689efad4ce5a86c09ce58bd3a40185e0 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,13 +33,7 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
 
-DeviceContextPool* DeviceContextPool::pool = nullptr;
-
-Executor::Executor(const std::vector<platform::Place>& places) {
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto borrowed_contexts = pool.Borrow(places);
-  device_contexts_.swap(borrowed_contexts);
-}
+Executor::Executor(const platform::Place& place) : place_(place) {}
 
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
   if (var_type == proto::VarDesc::LOD_TENSOR) {
@@ -71,7 +65,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   //    - will change to use multiple blocks for RNN op and Cond Op
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
   auto& block = pdesc.Block(block_id);
-  auto& device = device_contexts_[0];
 
   Scope* local_scope = scope;
   if (create_vars) {
@@ -107,7 +100,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
     VLOG(3) << op->DebugString();
-    op->Run(*local_scope, *device);
+    op->Run(*local_scope, place_);
   }
   if (create_local_scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 
a3d1609293a0d687c33447ca7a0df95c6aac3bc5..d869e18901b82959a40cc296aa0844c20ea63ac1 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -14,9 +14,6 @@ limitations under the License. */ #pragma once -#include -#include - #include "paddle/framework/op_info.h" #include "paddle/framework/program_desc.h" #include "paddle/framework/scope.h" @@ -26,96 +23,13 @@ limitations under the License. */ namespace paddle { namespace framework { -class DeviceContextPool { - public: - static DeviceContextPool& Get() { - PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); - return *pool; - } - - static DeviceContextPool& Create(const std::vector& places) { - if (pool == nullptr) { - pool = new DeviceContextPool(places); - } - return *pool; - } - - const platform::DeviceContext* Borrow(const platform::Place& place) { - auto range = device_contexts_.equal_range(place); - if (range.first == range.second) { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - return range.first->second; - } - - std::vector Borrow( - const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); - std::vector borrowed_contexts; - for (auto& place : places) { - auto range = device_contexts_.equal_range(place); - if (range.first == range.second) { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - // TODO(dzhwinter) : assign the first found device. Will enhanced later. - // device load balancer maybe useful here. - borrowed_contexts.emplace_back(range.first->second); - } - return borrowed_contexts; - } - - explicit DeviceContextPool(const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { - device_contexts_.emplace( - places[i], new platform::CPUDeviceContext( - boost::get(places[i]))); - } else if (platform::is_gpu_place(places[i])) { -#ifdef PADDLE_WITH_CUDA - device_contexts_.emplace( - places[i], new platform::CUDADeviceContext( - boost::get(places[i]))); -#else - PADDLE_THROW( - "'GPUPlace' is not supported, Please re-compile with WITH_GPU " - "option"); -#endif - } - } - } - - ~DeviceContextPool() {} - - private: - static DeviceContextPool* pool; - struct Hash { - std::hash hash_; - size_t operator()(const platform::Place& place) const { - return hash_(place.which()); - } - }; - std::unordered_multimap - device_contexts_; - DISABLE_COPY_AND_ASSIGN(DeviceContextPool); -}; - class Executor { public: // TODO(dzhwinter) : Do not rely on this function, it will be removed explicit Executor(const platform::DeviceContext& device) - : Executor(std::vector({device.GetPlace()})) {} - - explicit Executor(const platform::Place& place) - : Executor(std::vector({place})) {} + : Executor(device.GetPlace()) {} - explicit Executor(const std::vector& places); + explicit Executor(const platform::Place& place); /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope @@ -128,7 +42,7 @@ class Executor { bool create_vars = true); private: - std::vector device_contexts_; + const platform::Place place_; }; } // namespace framework diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 1c4476f4b30aebf094eb27b45fb435c24a9061c1..4deb4fa903dec04e9b76c5a620f1eb76c9f1db07 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -14,8 +14,8 @@ #include #include -#include "paddle/framework/executor.h" #include 
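The `executor.h` rewrite above removes the executor-owned `DeviceContextPool` and keeps only a `platform::Place`; device contexts are now looked up on demand from the global `platform::DeviceContextPool`. A minimal sketch of what calling code looks like after this change, assuming the pool has already been populated (for example by `InitDevices`):

```cpp
#include "paddle/framework/executor.h"
#include "paddle/framework/init.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"

void RunOnCPU(const paddle::framework::ProgramDesc& program) {
  paddle::framework::InitDevices({"CPU"});    // builds the DeviceContextPool once
  paddle::platform::CPUPlace place;           // the executor is now keyed by a Place
  paddle::framework::Executor exec(place);    // was: Executor(vector of places)
  paddle::framework::Scope scope;
  exec.Run(program, &scope, 0 /*block_id*/);  // each op borrows its context from the pool
}
```

Keying the executor by `Place` keeps the operator interface device-agnostic while still letting each op reach the matching context through the pool.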
"paddle/framework/init.h" +#include "paddle/platform/device_context.h" #include "paddle/platform/place.h" #include "paddle/string/piece.h" @@ -48,7 +48,7 @@ bool InitDevices(const std::vector &devices) { std::vector places; for (auto &device : devices) { auto p = string::Piece(device); - if (string::Find(p, ':', 0) == string::Piece::npos) { + if (string::HasPrefix(p, "CPU")) { places.emplace_back(platform::CPUPlace()); } else if (string::HasPrefix(p, "GPU")) { #ifdef PADDLE_WITH_CUDA @@ -69,10 +69,9 @@ bool InitDevices(const std::vector &devices) { return platform::is_cpu_place(place); }) == places.end()) { places.emplace_back(platform::CPUPlace()); - LOG(WARNING) << "Not specified any device, use CPU by Default."; + LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } - DeviceContextPool::Create(places); - return true; + platform::DeviceContextPool::Create(places); return true; } diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc index f65e881a761e0a546d595eced26dd5b12475a763..cb1ba7ce8fdbf740846689356c94c4f2fabb95cb 100644 --- a/paddle/framework/init_test.cc +++ b/paddle/framework/init_test.cc @@ -23,5 +23,9 @@ TEST(Init, InitDevices) { #ifdef PADDLE_WITH_CUDA std::vector ds2 = {"CPU", "GPU:0", "GPU:1"}; ASSERT_EQ(InitDevices(ds2), true); + + // test re-init + std::vector ds3 = {"GPU:0", "GPU:1"}; + ASSERT_EQ(InitDevices(ds3), true); #endif } diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 27713e5cbffe95e0ae31ac94a70c64deb53c4ffb..4cdf6e0865e0922b72bd184172f85a9c705dcd00 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -8,8 +8,7 @@ namespace framework { class CosineOp : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::framework::Scope scope; - paddle::platform::CPUDeviceContext dev_ctx; - op->Run(scope, dev_ctx); + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); float scale_get = op->Attr("scale"); ASSERT_EQ(scale_get, scale); } @@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::framework::Scope scope; - paddle::platform::CPUDeviceContext dev_ctx; - op->Run(scope, dev_ctx); + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); ASSERT_EQ(op->Attr("scale"), 1.0); } @@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) { attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_i(4); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - paddle::platform::CPUDeviceContext dev_ctx; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; - op->Run(scope, dev_ctx); + op->Run(scope, cpu_place); int test_attr = op->Attr("test_attr"); ASSERT_EQ(test_attr, 4); } diff 
--git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 0e58c0b5707516bd1274181df568d08ff504c152..5d38ef5bebc33039e0391069ae87a974fff537af 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/operator.h" #include #include + +#include "paddle/framework/executor.h" #include "paddle/framework/lod_tensor_array.h" +#include "paddle/framework/operator.h" #include "paddle/framework/shape_inference.h" #include "paddle/framework/var_type.h" @@ -388,11 +390,11 @@ class RuntimeInferShapeContext : public InferShapeContext { }; void OperatorWithKernel::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { + const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); - - ExecutionContext ctx(*this, scope, dev_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); + auto dev_ctx = pool.Borrow(place); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); @@ -404,6 +406,8 @@ void OperatorWithKernel::Run(const Scope& scope, // check if op[type] have kernel for kernel_key OpKernelMap& kernels = kernels_iter->second; + + ExecutionContext ctx(*this, scope, *dev_ctx); auto kernel_key = GetKernelType(ctx); auto kernel_iter = kernels.find(kernel_key); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 3207360cbaca4e3b96dfe933c67aaa70c59a6044..ef750aff1bae2540690ccc82b1105449344bf4ab 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -83,8 +83,7 @@ class OperatorBase { virtual std::string DebugString() const; /// Net will call this function to Run an op. - virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const = 0; + virtual void Run(const Scope& scope, const platform::Place& place) const = 0; virtual bool IsNetOp() const { return false; } @@ -159,8 +158,7 @@ class OperatorBase { class NOP : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} std::unique_ptr Clone() const override { return std::unique_ptr(new NOP(*this)); } @@ -383,8 +381,7 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const final; + void Run(const Scope& scope, const platform::Place& place) const final; static std::unordered_map& AllOpKernels() { diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 05a465152204c8e9f9dbd75d0bfb21ea44d25cf1..fbca45b59dc5446e93e79599f471d80a06ea3661 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#include "paddle/framework/operator.h" #include "gtest/gtest.h" + +#include "paddle/framework/init.h" #include "paddle/framework/op_info.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { @@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase { OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs), x(1) {} - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, const platform::Place& place) const override { ++op_run_num; ASSERT_EQ(static_cast(inputs_.size()), 1); ASSERT_EQ(static_cast(outputs_.size()), 1); @@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase { int x{0}; }; -class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { public: - OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, - OpAttrChecker* op_checker) + OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); @@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name, } } -REGISTER_OP_WITHOUT_GRADIENT( - test_operator, paddle::framework::OpWithoutKernelTest, - paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); TEST(OperatorBase, all) { + paddle::framework::InitDevices({"CPU"}); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); @@ -80,13 +80,13 @@ TEST(OperatorBase, all) { attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); scope.Var("OUT1"); ASSERT_EQ(paddle::framework::op_run_num, 0); - op->Run(scope, device_context); + op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::op_run_num, 1); } @@ -123,7 +123,6 @@ template class CPUKernelTest : public OpKernel { public: void Compute(const ExecutionContext& ctx) const { - std::cout << "this is cpu kernel" << std::endl; std::cout << ctx.op().DebugString() << std::endl; cpu_kernel_run_num++; ASSERT_EQ(ctx.op().Input("x"), "IN1"); @@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, // test with single input TEST(OpKernel, all) { + paddle::framework::InitDevices({"CPU"}); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_kernel"); BuildVar("x", {"IN1"}, op_desc.add_inputs()); @@ -205,12 +205,12 @@ TEST(OpKernel, all) { attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); - op->Run(scope, cpu_device_context); + op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); } @@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, TEST(OpKernel, multi_inputs) { using namespace paddle::framework; 
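For operators without kernels, the migration visible in these tests is mechanical: the `Run` override now takes a `platform::Place`, and a `DeviceContext` is borrowed from the pool only when the body actually needs one. A sketch of an operator written against the new signature; the class name and the `Wait()` call are illustrative, not part of this diff:

```cpp
#include <memory>

#include "paddle/framework/operator.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace framework {

class PlaceAwareNop : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
  void Run(const Scope& scope, const platform::Place& place) const override {
    // Borrow a context only if the op really needs one (copies, math functors, ...).
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
    auto& dev_ctx = *pool.Borrow(place);
    dev_ctx.Wait();  // illustrative use of the borrowed context
  }
  std::unique_ptr<OperatorBase> Clone() const override {
    return std::unique_ptr<OperatorBase>(new PlaceAwareNop(*this));
  }
};

}  // namespace framework
}  // namespace paddle
```

Ops that never touch device memory (like `NOP` or the test operators above) can simply ignore `place` and skip the pool entirely.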
+ paddle::framework::InitDevices({"CPU"}); proto::OpDesc op_desc; + op_desc.set_type("op_multi_inputs_with_kernel"); BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); BuildVar("k", {"k0"}, op_desc.add_inputs()); @@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) { attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; scope.Var("x0")->GetMutable(); scope.Var("x1")->GetMutable(); @@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) { scope.Var("y1")->GetMutable(); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - op->Run(scope, cpu_device_context); + op->Run(scope, cpu_place); } class OperatorClone : public paddle::framework::OperatorBase { @@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase { const paddle::framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const paddle::framework::Scope& scope, - const paddle::platform::DeviceContext& dev_ctx) const override {} + const paddle::platform::Place& place) const override {} }; TEST(Operator, Clone) { + paddle::framework::InitDevices({"CPU"}); OperatorClone a("ABC", paddle::framework::VariableNameMap{}, paddle::framework::VariableNameMap{}, paddle::framework::AttributeMap{}); diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 1f2b4fdb4b4a99d5baf5de1cc226dc196ab4eb2e..d641918c56d7e81f0dc10331a662fe94637428f1 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase { protected: size_t GetOffset(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const { + const platform::Place &place) const { auto *i = scope.FindVar(Input("I")); PADDLE_ENFORCE(i != nullptr, "I must be set"); auto &i_tensor = i->Get(); PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + size_t offset; if (platform::is_gpu_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index b6ca3cad94425207629160a4c7d715f685b23a09..73796229bcff3b4253994806504c01374b89ee9a 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -12,10 +12,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &rank_table = scope.FindVar(Input("RankTable"))->Get(); @@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { continue; } auto slice = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx, &slice); out_offset += len; diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index a914ff4ba92318c75326bd7945bb73bcb93b6fc3..60a913947ffd985bc51136294f90c823a9ead1e3 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -15,6 +15,7 @@ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/var_type.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); if (x == nullptr) { return; @@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase { PADDLE_ENFORCE( out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } }; diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index 32756faac5324cfb3b5366857d2c8176665fb3ec..52c28e7f532f9751589176c8d37362620167cf63 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/beam_search_decode_op.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase { const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); + auto& dev_ctx = *pool.Borrow(dev_place); + framework::ExecutionContext ctx(*this, scope, dev_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h index cc556bfe42ab12d73c0eb503d033efc272b5dd68..08b551ef9bd63106ed222d3a956a912294f827ec 100644 --- a/paddle/operators/beam_search_op.h +++ b/paddle/operators/beam_search_op.h @@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase { } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& dev_place) const override { LOG(INFO) << "run beam search op"; auto ids_var = scope.FindVar(Input("ids")); auto scores_var = scope.FindVar(Input("scores")); diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index 8c860676e06de5dac9570d2a6f7271ff451eebee..455fbd8ca3f5083fac51776524daca6f6a029667 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/cond_op.h" - #include "paddle/operators/gather.h" #include "paddle/operators/scatter.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, } } -void CondOp::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { +void CondOp::Run(const Scope& scope, const platform::Place& place) const { + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); + auto& dev_ctx = *pool.Borrow(place); + PrepareDataForSubnet(scope, dev_ctx); std::vector& sub_scopes = GetSubScopes(scope); for (int i = 0; i < BRANCH_NUM; ++i) { - sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx); + sub_net_op_[i]->Run(*sub_scopes[i], place); } MergeDataFromSubnet(scope, dev_ctx); } diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h index 93121fb31be287794249b5a62386d5a8dd268a0c..7dcdc47e0b2ff216bea92d083fe5897009384d39 100644 --- a/paddle/operators/cond_op.h +++ b/paddle/operators/cond_op.h @@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase { } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; + const platform::Place& place) const override; private: const int TRUE_BRANCH = 0; diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index 204be7d1e5385b7fdab54914bec216543e360cd3..d8fd6420daf440cb252b07a82786f1d129729c61 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp { const framework::AttributeMap &attrs) : ConditionalOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const 
platform::Place &dev_place) const override { auto xs = InputTensors(scope); bool need_run = std::all_of( xs.begin(), xs.end(), @@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp { scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); + framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); } } @@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp { const framework::AttributeMap &attrs) : ConditionalOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto xs = this->InputTensors(scope); bool need_run = std::all_of( xs.begin(), xs.end(), @@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp { auto &scopes = scope_var->Get>(); framework::Scope &cur_scope = *scopes[0]; + framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"), + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), Outputs(framework::GradVarName("Params"))); - AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"), + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), Outputs(framework::GradVarName("X"))); } } private: void AssignLocalGradientToGlobal( - const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope, + const platform::Place &place, const framework::Scope &cur_scope, const std::vector &p_names, const std::vector &pg_names) const { for (size_t i = 0; i < p_names.size(); ++i) { @@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp { auto assign = framework::OpRegistry::CreateOp( "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, framework::AttributeMap{}); - assign->Run(cur_scope, dev_ctx); + assign->Run(cur_scope, place); cur_scope.Rename(new_in_grad_name, in_grad_name); } } diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 66b8080c26192a74cc27bce9a00107de89822717..65c98a219b378e8325ed8e054f79936fff386e27 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); @@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + + framework::CopyFrom(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 616590f2001be3bea4e50c0c1755a80eb20e9348..21c34512bf42053971d1169bd0e2294fcd773bef 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -14,6 +14,7 @@ #include 
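Control-flow operators (`cond_op`, `conditional_block_op`, and later `recurrent_op`) all follow the same recipe: instead of forwarding a `DeviceContext`, they forward the `Place` and build a local `Executor` for their sub-block. A condensed sketch of that pattern, assuming a `BlockDesc*` obtained from an attribute as in `ConditionalBlockOp` above:

```cpp
#include "paddle/framework/block_desc.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"

// Runs one sub-block in `sub_scope` on the device identified by `place`.
void RunSubBlock(paddle::framework::BlockDesc* block,
                 paddle::framework::Scope* sub_scope,
                 const paddle::platform::Place& place) {
  paddle::framework::Executor exec(place);  // the context is resolved inside the executor
  exec.Run(*block->Program(), sub_scope, block->ID(),
           false /*create_local_scope*/);
}
```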
"paddle/framework/feed_fetch_type.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, @@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); dst_item.set_lod(src_item.lod()); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index 3489079eaa3e8f04e27941de942ce9e14f8434f9..fe0706c4a9da864025737584b72c02cca83c956b 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto data_type = static_cast(Attr("dtype")); auto value = Attr("value"); @@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase { auto cpu = platform::CPUPlace(); out.mutable_data(cpu, framework::ToTypeIndex(data_type)); } else { - out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type)); + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(dev_place); math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index f0c6cff8e34c9038c2321c0326bd2ef728d665ba..9a2d8aafcaea6327d1ea95ffb0f232059ef0a473 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -15,6 +15,7 @@ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &out = detail::Ref(detail::Ref(scope.FindVar(Output("Out")), "Cannot find variable %s", Output("Out")) @@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase { auto dtype = static_cast(Attr("dtype")); platform::CPUPlace cpu; auto force_cpu = Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), - framework::ToTypeIndex(dtype)); + out.mutable_data(force_cpu ? 
cpu : place, framework::ToTypeIndex(dtype)); framework::LoDTensor tensor; - if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) { + if (force_cpu || platform::is_cpu_place(place)) { tensor.ShareDataWith(out); } else { // Always make tensor in CPU memory. @@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase { framework::VisitDataType( dtype, FillOpVisitor(&tensor, Attr>("value"))); - if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) { + if (!force_cpu && platform::is_gpu_place(place)) { // Copy tensor to out - framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(tensor, place, dev_ctx, &out); } } }; diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 789c92102d63355a80c3330f2107c731206397f4..3988ac12c740282a37d912c98a60de09e88f89bb 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &out = *scope.FindVar(Output("Out"))->GetMutable(); diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc index 3616a0414f9e889376f8ba46e7567d7171eff3bf..545f87d4ed95ba51d133747f72ee18d7b2a88105 100644 --- a/paddle/operators/is_empty_op.cc +++ b/paddle/operators/is_empty_op.cc @@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { // get input auto *var = scope.FindVar(Input(kInput)); PADDLE_ENFORCE_NOT_NULL(var); diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 6c51dad27a4d9cd9e48b8591b1f14472c83ceaf1..ae6515bb12a9c3f523b23280296f36a0998fc0b2 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -11,10 +11,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/framework/op_registry.h" - -#include +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto filename = Attr("file_path"); std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", @@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); framework::DeserializeFromStream(fin, tensor); - auto place = dev_ctx.GetPlace(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + if (platform::is_gpu_place(place)) { // copy CPU to GPU framework::LoDTensor cpu_tensor; diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc index cc8593810baf83e12368e67ceaeef0631e35c051..d71cb028bca04e555f9802895fe7d7d96cb27859 100644 --- a/paddle/operators/lod_array_length_op.cc +++ b/paddle/operators/lod_array_length_op.cc @@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &out = *scope.FindVar(Output("Out"))->GetMutable(); diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc index 2d67046bfee01d8d148da1c8b705d3ad959a4839..c351ad8fef71c78681dffcba052f0f2ce15562d1 100644 --- a/paddle/operators/lod_rank_table_op.cc +++ b/paddle/operators/lod_rank_table_op.cc @@ -24,7 +24,7 @@ class LoDRankTableOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto x = scope.FindVar(Input("X"))->Get(); auto *out = scope.FindVar(Output("Out"))->GetMutable(); diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 643f8859f3d0d44c0b5be922bd786ab04093df94..c7b9057f8da65d0c7663fa9178f20eb7b97b53e2 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -15,6 +15,7 @@ #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", Input("X")) .Get(); @@ -86,6 +87,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase { // out[i][offset: offset+len] = x[each_range.begin: each_range.end] auto slice = out[i].Slice(static_cast(offset), static_cast(offset + len)); + + platform::DeviceContextPool &pool 
= platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x.Slice(static_cast(each_range.begin), static_cast(each_range.end)), x.place(), dev_ctx, &slice); diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc index dec2874a1fd13c1379e37d7b9755d465ffb1a6f7..8d629fe7355f5313d1a802eb0823aa9dc1a99eb5 100644 --- a/paddle/operators/max_sequence_len_op.cc +++ b/paddle/operators/max_sequence_len_op.cc @@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &rank_table = scope.FindVar(Input("RankTable"))->Get(); auto *out = diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 5edf29c3af958f5a939fdb830d46aca4b8d3dbe0..2287f347910e83c25d2155b80670f9d991c1e5b2 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(dev_place); + auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); auto &in_true = scope.FindVar(Input("InTrue"))->Get(); diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index e19f534f8a2d05cd9b569a0eebb287db3d3321ba..368d2bfaa1014c2b4c3bf61cc46e563c32e0b4fe 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -24,7 +24,7 @@ class NCCLInitOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { const auto &name = Output("Communicator"); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), "Can not find variable '%s' in the scope.", name); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index c1046aadafbde303a3a8b12f2377018396b9adb8..b6e4ccb73f8133a55c17ddfa93b3ab4a21496561 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -22,6 +22,7 @@ #include #include "paddle/framework/block_desc.h" +#include "paddle/framework/init.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/program_desc.h" @@ -49,7 +50,7 @@ const f::DDim kDims = {100, 100}; class NCCLTester : public ::testing::Test { public: virtual void SetUp() override { - cpu_ctx = new p::CPUDeviceContext(p::CPUPlace()); + paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list.size(); ++i) { p::GPUPlace place(i); dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); @@ -65,6 +66,7 @@ class NCCLTester : public ::testing::Test { } void NCCLInitOp() { + paddle::platform::CPUPlace cpu_place; std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); @@ -76,7 +78,7 @@ class NCCLTester : public ::testing::Test { auto op = f::OpRegistry::CreateOp(*op1); VLOG(1) << "invoke NCCLInitOp."; - 
op->Run(g_scope, *cpu_ctx); + op->Run(g_scope, cpu_place); VLOG(1) << "NCCLInitOp finished."; } @@ -111,13 +113,12 @@ class NCCLTester : public ::testing::Test { VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); VLOG(1) << " send_tensor : " << send_tensor->numel() << " recv_tensor : " << recv_tensor->numel(); - op->Run(*scope, *ctx); + op->Run(*scope, place); VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); } public: std::vector dev_ctxs; - p::DeviceContext *cpu_ctx; f::Scope g_scope; std::mutex mu; }; @@ -131,14 +132,14 @@ TEST(NCCL, ncclInitOp) { op_desc->SetAttr("gpus", {gpu_list}); f::Scope g_scope; - std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); + paddle::platform::CPUPlace cpu_place; auto *var = g_scope.Var("x1"); var->GetMutable(); auto op = f::OpRegistry::CreateOp(*op_desc); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx.get()); + op->Run(g_scope, cpu_place); VLOG(1) << "NCCLInitOp finished."; } @@ -294,9 +295,18 @@ int main(int argc, char **argv) { return 0; } - for (int i = 0; i < dev_count; ++i) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::GPUPlace(i)); gpu_list.emplace_back(i); } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Create(places); + testing::InitGoogleTest(&argc, argv); // device context should be release before scope. diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index 8935751f15ccc4861c9e06d8d9031c8dff1a4af3..85d0153b32c0ba53bfe0912fc2682c8b635ba172 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase { * will be used. 
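Standalone test binaries that do not go through `InitDevices` must now populate the pool themselves before any operator runs, which is what the NCCL test's `main` does above. A trimmed sketch of that setup (NCCL-specific bookkeeping omitted; the `gpu_info.h` header name is an assumption, and `GetCUDADeviceCount`/`GPUPlace` are only usable with `PADDLE_WITH_CUDA`):

```cpp
#include <vector>

#include "gtest/gtest.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/platform/gpu_info.h"
#endif

int main(int argc, char** argv) {
  std::vector<paddle::platform::Place> places;
  places.emplace_back(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  for (int i = 0; i < paddle::platform::GetCUDADeviceCount(); ++i) {
    places.emplace_back(paddle::platform::GPUPlace(i));
  }
#endif
  // The pool must exist before RUN_ALL_TESTS() creates and runs any operator.
  paddle::platform::DeviceContextPool::Create(places);
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```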
*/ void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& place) const override { for (auto& op : ops_) { - op->Run(scope, dev_ctx); + op->Run(scope, place); } } diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc index 22fba9568d018586b4622884b7d6145fd646adb0..dfd86546e83a6276aedd198eaeb6fad2c50944df 100644 --- a/paddle/operators/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; DEFINE_OP_CLONE_METHOD(TestOp); - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, const platform::Place& place) const override { ++run_cnt; } }; diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 5981d5745d24e0b2fe68bf8b9852cb8a6094885f..77f3a40b76334c7c8499323ec2794f178a14217a 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase { : RecurrentBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto seq_len = static_cast(this->GetSequenceLength(scope)); VLOG(3) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); - framework::Executor executor(dev_ctx); + framework::Executor executor(place); auto *block = Attr(kStepBlock); + auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -270,6 +271,10 @@ class RecurrentOp : public RecurrentBase { executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output this->LinkTensorWithCallback( @@ -278,14 +283,13 @@ class RecurrentOp : public RecurrentBase { framework::LoDTensor *dst_tensor) { if (i == 0) { // create output tensor at begin dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims())); - dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type()); + dst_tensor->mutable_data(place, src_tensor.type()); } auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); // Explicit copy output since the local RNN scope can be destroyed // early. 
- framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx, - &dst_out); + framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out); }); scopes.Next(); @@ -311,15 +315,20 @@ class RecurrentGradOp : public RecurrentBase { : RecurrentBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto seq_len = static_cast(GetSequenceLength(scope)); StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); - framework::Executor executor(dev_ctx); + framework::Executor executor(place); auto *block = Attr(kStepBlock); + auto *program = block->Program(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; @@ -366,8 +375,7 @@ class RecurrentGradOp : public RecurrentBase { auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); - framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx, - cur_grad_tensor); + framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor); } } @@ -410,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase { auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", framework::VariableNameMap{}, {{"Out", {pg_names[param_id]}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, place); } auto new_inside_name = cur_scope.Rename(inside_grad_name); @@ -419,7 +427,7 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); - sum_op->Run(cur_scope, dev_ctx); + sum_op->Run(cur_scope, place); cur_scope.Rename(new_inside_name, inside_grad_name); } @@ -437,11 +445,11 @@ class RecurrentGradOp : public RecurrentBase { } if (step_id == 0) { // alloc memory outside->Resize(PrependDims(seq_len, inside.dims())); - outside->mutable_data(dev_ctx.GetPlace(), inside.type()); + outside->mutable_data(place, inside.type()); } auto dst = outside->Slice(seq_offset, seq_offset + 1); - framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst); + framework::CopyFrom(inside, place, dev_ctx, &dst); }); VLOG(5) << "Link outside gradient finished "; @@ -453,8 +461,8 @@ class RecurrentGradOp : public RecurrentBase { [&](const framework::LoDTensor &inside, framework::LoDTensor *outside) { outside->Resize(inside.dims()); - outside->mutable_data(dev_ctx.GetPlace(), inside.type()); - framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside); + outside->mutable_data(place, inside.type()); + framework::CopyFrom(inside, place, dev_ctx, outside); }); VLOG(5) << "Link initialize state gradient finished "; } diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 4e91d1151ebf7a0cd520f2f1fa58a0cc4a0d1bef..89196f27a39bbaf49338cd725942711a7c6acccb 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -73,7 +73,7 @@ class RecvOp : public framework::OperatorBase { } void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { // FIXME(typhoonzero): no new scopes for every run. 
framework::Scope &recv_scope = scope.NewScope(); rpc_service_->SetScope(&recv_scope); @@ -113,7 +113,9 @@ class RecvOp : public framework::OperatorBase { auto *var = recv_scope.Var(grad_var_name); auto *tensor = var->GetMutable(); // FIXME(typhoonzero): do not copy - framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(v.second, place, dev_ctx, tensor); } rpc_service_->Reset(); @@ -121,7 +123,7 @@ class RecvOp : public framework::OperatorBase { framework::proto::ProgramDesc program_desc; program_desc.ParseFromString(program_str); framework::ProgramDesc program(program_desc); - framework::Executor executor(dev_ctx); + framework::Executor executor(place); // Run sub graph to get optimized tensor try { executor.Run(program, &recv_scope, 0, /*global_block*/ diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc index 5e3079ee0c91c337dca1e57729438fb9be4a0ff4..09d3ccc356b0c485474352c315460d02a603c985 100644 --- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -12,9 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/framework/lod_rank_table.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -53,7 +54,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input lod tensor variable %s", Input("X")) @@ -69,11 +70,11 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { out.Resize(x.dims()); out.mutable_data(x.place(), x.type()); - this->process(dev_ctx, x, rank_table, &out); + this->process(place, x, rank_table, &out); } protected: - virtual void process(const platform::DeviceContext &dev_ctx, + virtual void process(const platform::Place &place, const framework::LoDTensor &x, const framework::LoDRankTable &rank_table, framework::LoDTensor *out) const = 0; @@ -104,7 +105,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { return absolute_table; } - size_t CopyTensorAndLod(const platform::DeviceContext &dev_ctx, + size_t CopyTensorAndLod(const platform::Place &place, const AbsoluteRankTableItem &item, const framework::LoDTensor &x, framework::LoDTensor *out, size_t out_offset) const { @@ -130,6 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { auto x_sliced = x.Slice(x_offset, x_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); out_offset += len; return out_offset; @@ -145,8 +148,7 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} protected: - void process(const platform::DeviceContext &dev_ctx, - const framework::LoDTensor 
&x, + void process(const platform::Place &place, const framework::LoDTensor &x, const framework::LoDRankTable &rank_table, framework::LoDTensor *out) const override { auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); @@ -154,7 +156,7 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { out->mutable_lod()->clear(); for (auto &item : rank_table.items()) { PADDLE_ENFORCE_LT(item.index, absolute_table.size()); - out_offset = CopyTensorAndLod(dev_ctx, absolute_table[item.index], x, out, + out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out, out_offset); } } @@ -192,8 +194,7 @@ class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase { : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} protected: - void process(const platform::DeviceContext &dev_ctx, - const framework::LoDTensor &x, + void process(const platform::Place &place, const framework::LoDTensor &x, const framework::LoDRankTable &rank_table, framework::LoDTensor *out) const override { auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); @@ -214,7 +215,7 @@ class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase { // Copy TensorAndLod size_t out_offset = 0; for (auto &offset : offsets) { - out_offset = this->CopyTensorAndLod(dev_ctx, absolute_table[offset.first], + out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first], x, out, out_offset); } } diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc index 795bdf3e51a2dd323e85c497fcf203ad3ed54183..edd475ec39723690fe17d34ff4d7a7a5ebbc3840 100644 --- a/paddle/operators/rnn_memory_helper_op.cc +++ b/paddle/operators/rnn_memory_helper_op.cc @@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto mem_var_name = Input("X"); auto *mem_var = scope.FindVar(mem_var_name); PADDLE_ENFORCE(mem_var != nullptr, @@ -77,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto out_grad_var_name = Input(framework::GradVarName("Out")); auto *out_grad_var = scope.FindVar(out_grad_var_name); @@ -100,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, dev_place); } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc index a57466a48d4d6016fe2618d19fdca4c4f667124a..6606613d738aeea3b4496d102f014fa44431e8c8 100644 --- a/paddle/operators/save_load_op_test.cc +++ b/paddle/operators/save_load_op_test.cc @@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load); TEST(SaveLoadOp, CPU) { paddle::framework::Scope scope; paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext ctx(place); + auto var = scope.Var("test_var"); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); @@ -42,13 +42,13 @@ 
TEST(SaveLoadOp, CPU) { auto save_op = paddle::framework::OpRegistry::CreateOp( "save", {{"X", {"test_var"}}}, {}, attrs); - save_op->Run(scope, ctx); + save_op->Run(scope, place); auto load_var = scope.Var("out_var"); auto target = load_var->GetMutable(); auto load_op = paddle::framework::OpRegistry::CreateOp( "load", {}, {{"Out", {"out_var"}}}, attrs); - load_op->Run(scope, ctx); + load_op->Run(scope, place); int* actual = target->data(); for (int64_t i = 0; i < tensor->numel(); ++i) { EXPECT_EQ(expect[i], actual[i]); diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index eae1146d6c61fe56ebc48ac50e1eacd62e3fa7d0..f763b8d6bf8b6089c7d97b153580df30b56692a4 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -21,6 +21,7 @@ #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); @@ -88,6 +89,11 @@ class SaveOp : public framework::OperatorBase { "SaveOp only support LoDTensor, %s has wrong type", iname); auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::SerializeToStream(fout, tensor, dev_ctx); } }; diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 48194a547bbea5ddda7c5f3e2421431d1d81042d..3ee6bd190d8f0473c2fb1f9e3e41af4a4d6353bc 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x_var = scope.FindVar(Input("X")); PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); auto &x_tensor = x_var->Get(); - size_t offset = this->GetOffset(scope, dev_ctx); + size_t offset = this->GetOffset(scope, place); auto *rank_table_var = scope.FindVar(Input("RankTable")); PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); auto &rank_table = rank_table_var->Get(); @@ -93,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); @@ -105,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { dx_tensor.Resize(x_tensor.dims()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + if (dout_var == nullptr) { // dx_tensor fill zero math::set_constant(dev_ctx, &dx_tensor, 0.0f); } else { diff 
--git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index 3542d8624fec49f75314f046434cbcadf307497e..89826ca6ee98d579f8b7c8795b6dc33cfa158ee1 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); auto *out_true = @@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(dev_place); + std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 90cbc19d1b1bab2e639e3d6d5b28cd13b30542f6..2ee9bf700c260124c34c225848f4941eadf443c2 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); if (x == nullptr) return; auto &x_tensor = x->Get(); - size_t offset = GetOffset(scope, dev_ctx); + size_t offset = GetOffset(scope, place); auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { @@ -39,7 +39,11 @@ class WriteToArrayOp : public ArrayOp { } if (x_tensor.memory_size() > 0) { auto *out_tensor = &out->at(offset); - CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + + CopyFrom(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " @@ -119,17 +123,18 @@ class ReadFromArrayOp : public ArrayOp { const framework::AttributeMap &attrs) : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); PADDLE_ENFORCE(x != nullptr, "X must be set"); auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out != nullptr, "Out must be set"); auto *out_tensor = out->GetMutable(); - size_t offset = GetOffset(scope, dev_ctx); + size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { - framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, - out_tensor); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); 
out_tensor->set_lod(x_array[offset].lod()); } else { VLOG(10) << "offset " << offset << " >= " << x_array.size(); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 324c8b98c4811328b2a89eadc3e3420c080bd7d1..11ee96faad5aba0c2dbc13937d0c060aea98078a 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); auto &cond = scope.FindVar(Input(kCondition))->Get(); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); - framework::Executor executor(dev_ctx); + framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); + auto *program = block->Program(); auto step_scopes = @@ -97,8 +98,8 @@ class WhileGradOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { - framework::Executor executor(dev_ctx); + const platform::Place &dev_place) const override { + framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); @@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase { auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", framework::VariableNameMap{}, {{"Out", {pg_names[param_id]}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, dev_place); } } @@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); - sum_op->Run(cur_scope, dev_ctx); + sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } } diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 88df28a9668e5f354d115ff8ab32cb21e03aefb5..f0a0ea70a0aa14e1db959e4e6ace2a44363d0c35 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -25,7 +25,7 @@ ENDIF() # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) -nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) +nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index dacee74fff369586c7ca2ff62cfe6aeebd8f39c7..a28e9de716c857145955cced85b99b77ef89b101 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,6 +15,59 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +DeviceContextPool* DeviceContextPool::pool = nullptr; + +const platform::DeviceContext* DeviceContextPool::Borrow( + const platform::Place& place) { + auto it = device_contexts_.find(place); + if (it == device_contexts_.end()) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return it->second; +} + +std::vector DeviceContextPool::Borrow( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); + std::vector borrowed_contexts; + for (auto& place : places) { + auto it = device_contexts_.find(place); + if (it != device_contexts_.end()) { + borrowed_contexts.emplace_back(it->second); + } else { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + } + return borrowed_contexts; +} + +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_.emplace(places[i], + new platform::CPUDeviceContext( + boost::get(places[i]))); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_.emplace(places[i], + new platform::CUDADeviceContext( + boost::get(places[i]))); +#else + PADDLE_THROW( + "'GPUPlace' is not supported, Please re-compile with WITH_GPU " + "option"); +#endif + } + } +} + CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 6cc0508522a97f3097b30e3340e7413a7093714a..1d46ce5c7031c2a27dde42c838ff444ce4ac6f54 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -11,8 +11,8 @@ limitations under the License. */ #pragma once -#include "paddle/platform/enforce.h" -#include "paddle/platform/place.h" +#include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/platform/dynload/cublas.h" @@ -20,10 +20,13 @@ limitations under the License. */ #include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif -#include + +#include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" +#include "glog/logging.h" + namespace paddle { namespace platform { @@ -105,5 +108,51 @@ class CUDNNDeviceContext : public CUDADeviceContext { #endif +/*! \brief device context pool singleton */ +class DeviceContextPool { + public: + explicit DeviceContextPool(const std::vector& places); + + static DeviceContextPool& Get() { + PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); + return *pool; + } + + /*! \brief Create should only called by Init function */ + static DeviceContextPool& Create(const std::vector& places) { + if (pool == nullptr) { + pool = new DeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + const platform::DeviceContext* Borrow(const platform::Place& place); + + /*! \brief Return handle of multi-device context. 
*/ + std::vector Borrow( + const std::vector& places); + + ~DeviceContextPool() {} + + private: + static DeviceContextPool* pool; + struct Hash { + std::hash hash_; + size_t operator()(const platform::Place& place) const { + int pre_hash = place.which() + << (sizeof(int) * 8 - NUM_PLACE_TYPE_LIMIT_IN_BIT); + if (platform::is_gpu_place(place)) { + pre_hash += boost::get(place).GetDeviceId(); + } + return hash_(pre_hash); + } + }; + std::unordered_map + device_contexts_; + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cu similarity index 58% rename from paddle/platform/device_context_test.cc rename to paddle/platform/device_context_test.cu index 109c13a8812dffac10d202cbc9d85c4e601bf197..f046c79e0a015023568071a157ae183bfb8df556 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cu @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/platform/device_context.h" #include "gtest/gtest.h" +#include "paddle/platform/device_context.h" + +#include "glog/logging.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -62,3 +64,54 @@ TEST(Device, CUDNNDeviceContext) { } } } + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::CUDADeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::GPUPlace; + + DeviceContextPool& pool = DeviceContextPool::Get(); + auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); + auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); + EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); + + std::vector gpu_places; + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + gpu_places.emplace_back(GPUPlace(i)); + } + auto dev_ctxs = pool.Borrow(gpu_places); + for (size_t i = 0; i < dev_ctxs.size(); ++i) { + auto* dev_ctx = static_cast(dev_ctxs[i]); + + // check same as GPUPlace(i) + GPUPlace place = boost::get(dev_ctx->GetPlace()); + EXPECT_EQ(place.GetDeviceId(), static_cast(i)); + } +} + +int main(int argc, char** argv) { + int dev_count = paddle::platform::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA " + "device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::GPUPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Create(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h index 11007c1031c6d224c475e9ef4f11e7797decd78e..cb31e00b8e2540008f83bbe274b6fae2a03f8a71 100644 --- a/paddle/platform/dynload/nccl.h +++ b/paddle/platform/dynload/nccl.h @@ -63,6 +63,8 @@ extern void LoadNCCLDSO(); __macro(ncclAllReduce); \ __macro(ncclBcast); \ __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclGetErrorString); diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 
5abd4d4a345ed2750231841325f2b19a2ee8c4c9..d1c7be0790b5e11d6273efe6c08cdb7bf22425c6 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -22,6 +22,7 @@ limitations under the License. */ #include #include +#include "paddle/platform/macros.h" #include "paddle/string/printf.h" #include "paddle/string/to_string.h" diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 94ab360a1967d22e73bc6aefc0301487537c97f7..6750c8da7db86500e9593cd41d39dbd229abad7a 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -12,17 +12,19 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include +#include +#include + #include "glog/logging.h" #include "gtest/gtest.h" + +#include "paddle/framework/init.h" #include "paddle/platform/device_context.h" #include "paddle/platform/dynload/nccl.h" #include "paddle/platform/enforce.h" #include "paddle/platform/gpu_info.h" -#include -#include -#include - static int dev_count = 0; namespace paddle { @@ -31,7 +33,8 @@ namespace platform { TEST(NCCL, init) { std::vector comms; comms.resize(dev_count); - dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + for (int i = 0; i < dev_count; ++i) { dynload::ncclCommDestroy(comms[i]); } @@ -131,6 +134,18 @@ int main(int argc, char** argv) { << dev_count; return 0; } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::GPUPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Create(places); + testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/paddle/platform/place.h b/paddle/platform/place.h index ca98920d414bc87ce243995a42e5672d0e61e108..6bff2d4d9cd7eefaa7212af2a1287e9aaff7d684 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -60,12 +60,14 @@ struct IsGPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const MKLDNNPlace &) const { return false; } bool operator()(const GPUPlace &gpu) const { return true; } + bool operator()(const CUDNNPlace &) const { return true; } }; struct IsMKLDNNPlace : public boost::static_visitor { bool operator()(const MKLDNNPlace &) const { return true; } bool operator()(const CPUPlace &) const { return false; } bool operator()(const GPUPlace &) const { return false; } + bool operator()(const CUDNNPlace &) const { return false; } }; // Define the max number of Place in bit length. i.e., the max number of places diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 2d7fe251416dce629dd0a2318aaa020ec9668d9b..de6b24f70d84a28add0c0a09cac79b8c5b1044de 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -360,10 +360,10 @@ All parameter, weight, gradient are variables in Paddle. 
}) .def("run", [](OperatorBase &self, const Scope &scope, - const platform::DeviceContext &dev_ctx) { - self.Run(scope, dev_ctx); - dev_ctx.Wait(); - }) + const platform::CPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::GPUPlace &place) { self.Run(scope, place); }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -417,7 +417,7 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "Executor") - .def(py::init &>()) + .def(py::init()) .def("run", &Executor::Run); m.def("unique_integer", UniqueIntegerGenerator); diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 268a0f2fa386adf99f7ea1589ff1f301f943a68b..413fd9b046f3f302feb5bd52beb284553a8ae192 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -14,9 +14,9 @@ #pragma once #include -#include "paddle/framework/executor.h" #include "paddle/framework/tensor.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" @@ -63,8 +63,7 @@ struct CastToPyBufferImpl { auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - framework::DeviceContextPool &pool = - framework::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); auto dev_ctx = static_cast( pool.Borrow(tensor.place())); @@ -138,7 +137,7 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - framework::DeviceContextPool &pool = framework::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); auto dev_ctx = static_cast(pool.Borrow(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 8132742749e4a622720c66692c8d09815714ebea..77f84cd43bdf35ae6f54b0db2b5f720d24872878 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -6,7 +6,6 @@ if(WITH_TESTING) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) if(NOT MOBILE_INFERENCE) - add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) - add_dependencies(paddle_gtest_main paddle_memory gtest gflags) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags) endif() endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index a491322b7e533f7a9c263a249494440269391003..7ba1bf095ab74e4b64a8fb39b84172d6f371a2cf 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "gflags/gflags.h" #include "gtest/gtest.h" +#include "paddle/framework/init.h" #include "paddle/memory/memory.h" int main(int argc, char** argv) { @@ -32,8 +34,11 @@ int main(int argc, char** argv) { google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); testing::InitGoogleTest(&argc, argv); paddle::memory::Used(paddle::platform::CPUPlace()); + std::vector devs = {"CPU"}; #ifdef PADDLE_WITH_CUDA paddle::memory::Used(paddle::platform::GPUPlace(0)); + devs.push_back("GPU:0"); #endif + paddle::framework::InitDevices(devs); return RUN_ALL_TESTS(); } diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 471255ef50a3be4739a89efbd978cdb4304d992d..051b9094aafa74b186776ae2041f95d0fe6d5f77 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -42,5 +42,10 @@ def __read_gflags_from_env__(): core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) + if core.is_compile_gpu(): + core.init_devices(["CPU", "GPU:0"]) + else: + core.init_devices(["CPU"]) + __read_gflags_from_env__() diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 4b4a0820abb9a85f3e9936190c835c2f186107b3..cdd576294f4f53bd3760b2c95a41b2129004a51a 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -47,13 +47,14 @@ class Executor(object): act_places.append(p) # TODO(dzhwinter) : consider that our fluid tests all written in - # GPUPlace(gpu_id), this will be changed in next PR. + # GPUPlace(gpu_id), this will be changed in the future if core.is_compile_gpu(): core.init_devices(["CPU", "GPU:0"]) else: core.init_devices(["CPU"]) - self.executor = core.Executor(act_places) + # TODO(dzhwinter) : only use the first place + self.executor = core.Executor(act_places[0]) self.places = places def aslodtensor(self, data): diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index e83c4a0622013cbfebdf39434ef252412697acb1..087283bfded07e25ddfd446849b9c5ca9d1e7651 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -90,12 +90,10 @@ def get_numeric_gradient(scope, def product(dim): return reduce(lambda a, b: a * b, dim, 1) - ctx = core.DeviceContext.create(core.CPUPlace()) - def get_output(): sum = [] for output_name in output_names: - op.run(scope, ctx) + op.run(scope, core.CPUPlace()) sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) return np.array(sum).mean() diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 903e84c32887100bbeef6ebf81f66f06f084fab5..1ff3932164bed75be71b5c6b7114df362b893f09 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase): LearningRate='LearningRate', epsilon=2.0) - ctx = core.DeviceContext.create(place) - adagrad_op.run(scope, ctx) + adagrad_op.run(scope, place) # get and compare moment result moment_result_array = np.array(moment) diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index a9c0b1cfd3417d0583fa9d4e15550e7543a6bd19..dfc047e1f0dc9fcf3d72d007b17d4c2de2077fbd 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -296,8 +296,7 @@ class TestBatchNormOp(OpTest): momentum=momentum, 
epsilon=epsilon) - ctx = core.DeviceContext.create(place) - batch_norm_op.run(scope, ctx) + batch_norm_op.run(scope, place) # check forward result self.__assert_close(y_tensor, y_out, "y_out") @@ -320,7 +319,7 @@ class TestBatchNormOp(OpTest): ["y_out", "mean", "variance", "saved_mean", "saved_variance"], place, feed_dict={"y_out": y_grad}) - batch_norm_op_grad.run(scope, ctx) + batch_norm_op_grad.run(scope, place) x_grad_tensor = create_or_get_tensor(scope, grad_var_name("x_val"), None, diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py index 5fad7d8cce5af3677aa77dc0abb64f1ecd380419..f329214dce407fe0382c51b29f0f4c33b562541a 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py @@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase): SentenceIds="sentence_ids", SentenceScores="sentence_scores") - ctx = core.DeviceContext.create(self.cpu_place) - beam_search_decode_op.run(self.scope, ctx) + beam_search_decode_op.run(self.scope, self.cpu_place) expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] self.assertEqual(sentence_ids.lod(), expected_lod) diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py index cc7c09bb59de3f83e47b4d95c1203f7f050c5132..595f132fa85f0a65f15d9ac31ad320e567c96358 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py @@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): def setUp(self): self.scope = core.Scope() - self.ctx = core.DeviceContext.create(core.CPUPlace()) self._create_ids() self._create_scores() self._create_pre_ids() @@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase): level=0, beam_size=2, end_id=0, ) - op.run(self.scope, self.ctx) + op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() print 'selected_ids', np.array(selected_ids) print 'lod', selected_ids.lod() diff --git a/python/paddle/v2/fluid/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py index 9d1df44b9065f8101e90b87815660f8c0818645f..32e54084e48cf77c569db4dee54a0c89d5108373 100644 --- a/python/paddle/v2/fluid/tests/test_cond_op.py +++ b/python/paddle/v2/fluid/tests/test_cond_op.py @@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase): self.create_global_variables() self.create_cond_op() self.create_sub_net() - ctx = core.DeviceContext.create(core.CPUPlace()) - self.condop.run(self.scope, ctx) + self.condop.run(self.scope, core.CPUPlace()) return np.array(self.scope.find_var("Out").get_tensor()) def create_global_variables(self): diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index a9d943b8b7f7d9bc0dec89c5360769e0328527ba..4afe0c6a6d36b4ab7b88b459ce8d182b287b860e 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -24,7 +24,6 @@ class TestGaussianRandomOp(unittest.TestCase): def gaussian_random_test(self, place): - context = core.DeviceContext.create(place) program = fluid.Program() block = program.global_block() vout = block.create_var(name="Out") diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py index 
ed6e3fe24f6333c9c90d760787eb13241a7e1868..0a4dd0f4faf370161e5695d97f0ed4bf73b6ec26 100644 --- a/python/paddle/v2/fluid/tests/test_is_empty_op.py +++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py @@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase): def one_case(self, input, target): op = Operator(type="is_empty", X=input, Out="out") - ctx = core.DeviceContext.create(core.CPUPlace()) - op.run(self.scope, ctx) + op.run(self.scope, core.CPUPlace()) out = self.scope.var("out").get_tensor() self.assertEqual(np.array(out)[0], target) diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py index ca05a381f06cfd40b7939dbda8d4f1f4aacd0271..9c345792beef46f65ec12e111f1d645fb31e69c7 100644 --- a/python/paddle/v2/fluid/tests/test_sgd_op.py +++ b/python/paddle/v2/fluid/tests/test_sgd_op.py @@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase): Grad='Grad', ParamOut='Param', LearningRate='LearningRate') - ctx = core.DeviceContext.create(place) - sgd_op.run(scope, ctx) + sgd_op.run(scope, place) # get and compare result result_array = np.array(param) diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index 00b4f196209a6414f1063a33c0e31093e33ca39d..d6872c8ba351a13b6fb8622cc23029c8c5cbe2e1 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -26,7 +26,6 @@ class TestUniformRandomOp(unittest.TestCase): self.uniform_random_test(place=core.GPUPlace(0)) def uniform_random_test(self, place): - context = core.DeviceContext.create(place) program = fluid.Program() block = program.global_block() vout = block.create_var(name="Out")
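
Taken together, the hunks above migrate every no-kernel operator from `Run(scope, dev_ctx)` to `Run(scope, place)` and move `DeviceContextPool` into `paddle/platform`, so an operator that still needs a `DeviceContext` borrows one from the pool at run time. The sketch below is not part of the patch; it is a minimal, hypothetical operator written against the new interface to show the convention in one place. The class name `CopyLikeOp`, the input/output names `X`/`Out`, and the `tensor_util.h` include (assumed to be where `framework::CopyFrom` is declared) are illustrative assumptions, not code from this change.

```cpp
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/tensor_util.h"  // assumed header for framework::CopyFrom
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {

// Hypothetical no-kernel operator that copies its input tensor to its output,
// following the post-patch convention: Run() receives a Place, and any
// DeviceContext needed for the copy is borrowed from the global pool.
class CopyLikeOp : public framework::OperatorBase {
 public:
  CopyLikeOp(const std::string &type, const framework::VariableNameMap &inputs,
             const framework::VariableNameMap &outputs,
             const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto *out =
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

    // The pool must already hold a context for `place`; it is populated once
    // per process, e.g. by framework::InitDevices() in C++ tests
    // (paddle_gtest_main.cc) or core.init_devices() from Python
    // (python/paddle/v2/fluid/__init__.py), before any Borrow() call.
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
    auto &dev_ctx = *pool.Borrow(place);

    // CopyFrom now takes the destination Place plus the borrowed context,
    // matching the call sites updated in this patch.
    framework::CopyFrom(x, place, dev_ctx, out);
  }
};

}  // namespace operators
}  // namespace paddle
```

In a real operator this class would also be registered and paired with a proto maker (as `while_op.cc` and the other touched operators do); both are omitted to keep the sketch focused on the `Run(Place)` plus `DeviceContextPool::Borrow` pattern. The Python side mirrors it: `op.run(scope, place)` and `core.Executor(place)` replace the old explicit `core.DeviceContext.create(place)` calls, as the updated tests above show.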