diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index d6601090d5b6150a5aa467210038d3693c3e67a8..682cff168d4d31e0565fc987604f97a671566fbd 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
     places.emplace_back(platform::CPUPlace());
     LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
   }
-  platform::DeviceContextPool::Create(places);
+  platform::DeviceContextPool::Init(places);
   return true;
 }
 
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index f48512b5c682698dae86593fb89a720eea503f7d..c0be11294c4a6b49ae4bc2f805f76e9f04508349 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -388,8 +388,8 @@ void OperatorWithKernel::Run(const Scope& scope,
                              const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto dev_ctx = pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);
 
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
index 741984bb68d3881f6ac26eaca7790190ed6e572a..ac217f1363dbd0360645bbe07cd71a17cc931a79 100644
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
   }
 
   /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1UL);
+  CHECK_EQ(config_.inputs_size(), 1);
   const NormConfig& conf = config_.inputs(0).norm_conf();
   localSize_ = conf.size();
   alpha_ = conf.scale();
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 060ffac8273724fad48753d9a7e2b0affbb6f25e..e0eef5d9f93d70930ee82d663de9610cc0176e33 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     size_t offset;
     if (platform::is_gpu_place(i_tensor.place())) {
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index 0aa04c268bd65dc246341a2f335c362496050bdb..49366fee8df5a44a97b7b4e87cbf0b7c813a414a 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
         }
         auto slice = out->Slice(out_offset, out_offset + len);
 
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-        auto &dev_ctx = *pool.Borrow(place);
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
 
         framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                             dev_ctx, &slice);
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
index 0560040509026e84eb543331996a6846751b8506..7d77be3be1034bb38f6c92c181aa525214073eec 100644
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
         out != nullptr,
         "The Output(Out) should not be null if the Input(X) is set.");
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
   }
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
index 52c28e7f532f9751589176c8d37362620167cf63..72e05607b0b612807d552b4c45b58f9d9ce9c2af 100644
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope& scope,
            const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-    auto& dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);
 
     framework::ExecutionContext ctx(*this, scope, dev_ctx);
 
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 455fbd8ca3f5083fac51776524daca6f6a029667..e333002bfd1ab40c62882f09cd207a12a0939648 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
 
 void CondOp::Run(const Scope& scope, const platform::Place& place) const {
   // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto& dev_ctx = *pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(place);
 
   PrepareDataForSubnet(scope, dev_ctx);
   std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index cecbb7226aedabc3bee63c518d0ab9da496fca11..48da52c3b68879a1da8550a5448090f9f1e715d3 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     framework::CopyFrom(feed_item, place, dev_ctx, out_item);
     out_item->set_lod(feed_item.lod());
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index fa20a06540efef05d6a15669b1f8dfb8bd5927bc..387d1e0a747f71d85826b52d140c2838112227f6 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
     dev_ctx.Wait();
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index fe0706c4a9da864025737584b72c02cca83c956b..dcd43a30c86b62d79f52ac640f14b295a062146c 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
       out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
     }
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
     math::set_constant(dev_ctx, &out, value);
   }
 };
diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc
index 57b4ec69384a55cef6d6d8d0c0145caae837b3f6..084ba1db62de0a6bf6829f8e9f4c274fb777e879 100644
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {
 
     if (!force_cpu && platform::is_gpu_place(place)) {
       // Copy tensor to out
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
       framework::CopyFrom(tensor, place, dev_ctx, &out);
     }
   }
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index 5425375c1ffa16c223ec1b05506ffb4b4394d6e1..65f021d91931541b712bd46aebc06e68144b2af0 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -40,8 +40,8 @@ class LoadOp : public framework::OperatorBase {
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
     framework::DeserializeFromStream(fin, tensor);
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     if (platform::is_gpu_place(place)) {
       // copy CPU to GPU
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
index ed99915bb7fc312cac955b50ffbc3237aee503b0..8d164b4abc54722a95a176dfe8ed341f8c5125d1 100644
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         auto slice = out[i].Slice(static_cast<int>(offset),
                                   static_cast<int>(offset + len));
 
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-        auto &dev_ctx = *pool.Borrow(place);
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
 
         framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
                                     static_cast<int>(each_range.end)),
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index 2287f347910e83c25d2155b80670f9d991c1e5b2..3f999e404f8afe6bded09c820509fa0f36d30bf6 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
 
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index 34a6e1a58d4bb8b267f3b6b3a0ac689c17036c37..6546096069d4c3fbc4908a16c2dba2ac6d7e6421 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
   }
 
   VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);
 
   testing::InitGoogleTest(&argc, argv);
 
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 71769e67c7032f2d808d1394883cbe93f826b2f6..056fa46949cd623845956521b068109085a8795e 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
                    false /*create_local_scope*/);
 
       // get device context from pool
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
 
       // Copy inside::output -> outside::output
       //    outside::output[seq_offset: seq_offset + 1] = inside::output
@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
     auto *program = block->Program();
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     for (size_t step_id = 0; step_id < seq_len; ++step_id) {
       size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
index 1063388e2539d47ca6ab56cd5fb0b946b6fb9147..8d652ff806461cea3d0e8d3bd70704b4b6bc2173 100644
--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
     auto x_sliced = x.Slice(x_offset, x_offset + len);
     auto out_sliced = out->Slice(out_offset, out_offset + len);
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
     framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
     out_offset += len;
     return out_offset;
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index d045a8b5b8d852278fd5140b9cf8707462b93c93..4b1cbe88836e340c94f797806243a6768410ed3d 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
     auto &tensor = var->Get<framework::LoDTensor>();
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     framework::SerializeToStream(fout, tensor, dev_ctx);
   }
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index e8a4773547861478b1771562ecd9e8b0bb3c3739..e5ef0740b6f385de7f17a3a419000cb8c897d986 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     if (dout_var == nullptr) {  // dx_tensor fill zero
       math::set_constant(dev_ctx, &dx_tensor, 0.0f);
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
index 89826ca6ee98d579f8b7c8795b6dc33cfa158ee1..2d8787d740c70f1d4696fdec381b572ecf031f57 100644
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
     auto &x_lod = x.lod();
     auto &mask_dim = mask.dims();
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
 
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
     if (platform::is_cpu_place(mask.place())) {
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 9529aab573a73f55f7a67d39cde99aca023f473e..53e38ec70336ca7f2d7c142e5fb1bbe427ab2957 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
     if (x_tensor.memory_size() > 0) {
       auto *out_tensor = &out->at(offset);
 
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
 
       CopyFrom(x_tensor, place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_tensor.lod());
@@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp {
     auto *out_tensor = out->GetMutable<framework::LoDTensor>();
     size_t offset = GetOffset(scope, place);
     if (offset < x_array.size()) {
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
       framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_array[offset].lod());
     } else {
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index e450ef32a432a9dede05835341a166454a551889..ea07f2e002cb76d09a11f7a5305c2d45b780e7bd 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -17,7 +17,7 @@ namespace platform {
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
-const platform::DeviceContext* DeviceContextPool::Borrow(
+const platform::DeviceContext* DeviceContextPool::Get(
     const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
   return it->second;
 }
 
-std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
-    const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-  std::vector<const platform::DeviceContext*> borrowed_contexts;
-  for (auto& place : places) {
-    auto it = device_contexts_.find(place);
-    if (it != device_contexts_.end()) {
-      borrowed_contexts.emplace_back(it->second);
-    } else {
-      PADDLE_THROW(
-          "'Place' is not supported, Please re-compile with WITH_GPU "
-          "option");
-    }
-  }
-  return borrowed_contexts;
-}
-
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
   PADDLE_ENFORCE_GT(places.size(), 0);
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 8ba12e1657b6671b36fa9ad3f498820d61af585e..dfef2c16d8f2277d57cbcfe51d108402e518799b 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -109,13 +109,13 @@ class DeviceContextPool {
  public:
   explicit DeviceContextPool(const std::vector<platform::Place>& places);
 
-  static DeviceContextPool& Get() {
+  static DeviceContextPool& Instance() {
     PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
     return *pool;
   }
 
-  /*! \brief  Create should only called by Init function */
+  /*! \brief  Init should only be called by start-up code, e.g. InitDevices */
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+  static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
     if (pool == nullptr) {
       pool = new DeviceContextPool(places);
     }
@@ -123,13 +123,7 @@ class DeviceContextPool {
   }
 
   /*! \brief  Return handle of single device context. */
-  const platform::DeviceContext* Borrow(const platform::Place& place);
-
-  /*! \brief  Return handle of multi-device context. */
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places);
-
-  ~DeviceContextPool() {}
+  const platform::DeviceContext* Get(const platform::Place& place);
 
  private:
   static DeviceContextPool* pool;
diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu
index 91011bf71ccf4295075aa2ed73dffa92cb22d773..ca10cf34639376798bf5ba05970c9c734e5a1ef8 100644
--- a/paddle/platform/device_context_test.cu
+++ b/paddle/platform/device_context_test.cu
@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
   using paddle::platform::CUDAPlace;
 
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
-  auto cpu_dev_ctx2 = pool.Borrow(CPUPlace());
-  EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1);
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
 
   std::vector<Place> gpu_places;
   int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; ++i) {
-    gpu_places.emplace_back(CUDAPlace(i));
-  }
-  auto dev_ctxs = pool.Borrow(gpu_places);
-  for (size_t i = 0; i < dev_ctxs.size(); ++i) {
-    auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
-
-    // check same as CUDAPlace(i)
-    CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
-    EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
+    auto dev_ctx = pool.Get(CUDAPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
   }
 }
 
 int main(int argc, char** argv) {
-  int dev_count = paddle::platform::GetCUDADeviceCount();
-  if (dev_count <= 1) {
-    LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
-                    "device count is "
-                 << dev_count;
-    return 0;
-  }
-
   std::vector<paddle::platform::Place> places;
 
   places.emplace_back(paddle::platform::CPUPlace());
@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
   }
 
   VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);
 
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index 8f815863a72b118a0a258a2aa4276a33b95b0e46..ef6d845874745af1150e4425f8d6be416cc44ece 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
   }
 
   VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);
 
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 67244d82602906231ac1fc870adccc7e82869407..4d5e73e2c266b301de4f19e09be7ab4009c936d3 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
         auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Borrow(tensor.place()));
+            pool.Get(tensor.place()));
 
         paddle::platform::GpuMemcpyAsync(
             dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
 
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
 }