diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 7b7582380cbfc6edd8eef5dcb58722c5dabfc9e7..2a87f02bd566f6b6d8326827a9cd5f0ae648de6c 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,16 +7,12 @@ if(WITH_GPU)
         dynload_cuda)
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 endif()
-
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
-
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
     set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
-    nv_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
-            device_context broadcast_op_handle)
 else()
     set(multi_devices_graph_builder_deps)
 endif()
@@ -25,3 +21,6 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
+
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
+        device_context broadcast_op_handle)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index a782ebf8fd3aec66d407f04ac041694ae82a8811..2c99a347bf520da66c19db1812cdfbd2ca23babf 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -29,13 +29,8 @@ Tensor *GetTensorFromVar(Variable *in_var) {
   return nullptr;
 }
 BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                                     const std::vector<platform::Place> &places,
-                                     const platform::ContextMap &ctxs)
-    : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) {
-  for (auto &p : places_) {
-    this->dev_ctxes_[p] = ctxs_.DevCtx(p);
-  }
-}
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
 
 void BroadcastOpHandle::RunImpl() {
   PADDLE_ENFORCE_EQ(this->inputs_.size(), 1);
@@ -47,26 +42,18 @@ void BroadcastOpHandle::RunImpl() {
   if (inputs_[0]->generated_op_)
     inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]);
 
-  auto iter = std::find(places_.begin(), places_.end(), in_place);
-  if (iter == places_.end()) {
-    PADDLE_THROW("The input of BCast is not in the places_.");
-  }
-
-  int offset = iter - places_.begin();
-  auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_);
+  auto in_scope_idx = in_var_handle->scope_idx_;
+  PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), "");
+  auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_);
 
   Tensor *in_tensor = GetTensorFromVar(in_var);
   for (auto *out : outputs_) {
     auto out_handle = static_cast<VarHandle *>(out);
     auto &out_p = out_handle->place_;
 
-    auto iter = std::find(places_.begin(), places_.end(), out_p);
-    if (iter == places_.end()) {
-      PADDLE_THROW("The output of BCast is not in the places_.");
-    }
-    int offset = iter - places_.begin();
-
-    auto *s = local_scopes_[offset];
+    auto out_scope_idx = out_handle->scope_idx_;
+    PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), "");
+    auto *s = local_scopes_[out_scope_idx];
     auto out_var = s->FindVar(out_handle->name_);
 
     PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), "");
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index a571af1218f4e2ace673fc39470b6fe3f6241c49..06ec164ce083d0a7b4607686dcdb3dbeb5755fec 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -35,11 +35,10 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
   const std::vector<Scope *> &local_scopes_;
   const std::vector<platform::Place> &places_;
-  const platform::ContextMap &ctxs_;
+  //  const platform::ContextMap &ctxs_;
 
   BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places,
-                    const platform::ContextMap &ctxs);
+                    const std::vector<platform::Place> &places);
 
   std::string Name() const override;
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index fd671ded217d489c3a9c05d88f13ec67745fc131..d03115f0be6901c8b409efe229daa87b9cbee9a1 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -25,51 +25,66 @@ const f::DDim kDims = {20, 20};
 
 class BroadcastTester : public ::testing::Test {
  public:
-  void SetUp() override {
-    int count = p::GetCUDADeviceCount();
-    if (count <= 1) {
-      LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                      "device count is "
-                   << count;
-      exit(0);
+  void InitCtx(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
     }
-    for (int i = 0; i < count; ++i) {
-      gpu_list_.emplace_back(p::CUDAPlace(i));
-    }
-    ctxs_ = new p::ContextMap(gpu_list_);
   }
 
   template <class T>
-  void BroadcastInitOp(int gpu_id = 0) {
+  void BroadcastInitOp(int input_scope_idx) {
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scope_.push_back(&g_scope_.NewScope());
       auto* out_var = local_scope_[j]->Var("out");
       out_var->GetMutable<T>();
     }
-    auto* in_var = local_scope_[gpu_id]->Var("input");
+    auto* in_var = local_scope_[input_scope_idx]->Var("input");
     in_var->GetMutable<T>();
 
-    bc_op_handle_ =
-        new f::details::BroadcastOpHandle(local_scope_, gpu_list_, *ctxs_);
+    bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_);
 
     f::details::VarHandle* in_var_handle = new f::details::VarHandle();
-    in_var_handle->place_ = gpu_list_[gpu_id];
+    in_var_handle->place_ = gpu_list_[input_scope_idx];
     in_var_handle->name_ = "input";
     in_var_handle->version_ = 1;
+    in_var_handle->scope_idx_ = input_scope_idx;
     in_var_handle->generated_op_ = nullptr;
     bc_op_handle_->AddInput(in_var_handle);
 
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j];
       f::details::VarHandle* out_var_handle = new f::details::VarHandle();
       out_var_handle->place_ = gpu_list_[j];
       out_var_handle->name_ = "out";
       out_var_handle->version_ = 2;
+      out_var_handle->scope_idx_ = j;
       out_var_handle->generated_op_ = bc_op_handle_;
       bc_op_handle_->AddOutput(out_var_handle);
     }
   }
   void BroadcastDestroy() {
-    delete ctxs_;
     for (auto in : bc_op_handle_->inputs_) {
       delete in;
     }
@@ -77,98 +92,131 @@ class BroadcastTester : public ::testing::Test {
       delete out;
     }
     delete bc_op_handle_;
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      delete ctxs_[j];
+    }
   }
 
- public:
-  f::Scope g_scope_;
-  p::ContextMap* ctxs_;
-  std::vector<f::Scope*> local_scope_;
-  std::vector<p::Place> gpu_list_;
-  f::details::BroadcastOpHandle* bc_op_handle_;
-};
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+  }
 
-TEST_F(BroadcastTester, BroadcastTestLodTensor) {
-  int gpu_id = 0;
-  BroadcastInitOp<f::LoDTensor>(gpu_id);
+  void TestBroadcastLodTensor() {
+    int input_scope_idx = 0;
+    BroadcastInitOp<f::LoDTensor>(input_scope_idx);
 
-  auto in_var = local_scope_[gpu_id]->Var("input");
-  auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
-  in_lod_tensor->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+    auto in_var = local_scope_[input_scope_idx]->Var("input");
+    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
 
-  std::vector<float> send_vector(f::product(kDims), gpu_id + 12);
-  for (size_t k = 0; k < send_vector.size(); ++k) {
-    send_vector[k] = k;
-  }
-  f::LoD lod{{0, 10, 20}};
-  paddle::framework::TensorFromVector<float>(
-      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor);
-  in_lod_tensor->set_lod(lod);
-  bc_op_handle_->Run(false);
-
-  ctxs_->WaitAll();
-
-  p::CPUPlace cpu_place;
-  for (size_t j = 0; j < gpu_list_.size(); ++j) {
-    auto out_var = local_scope_[j]->Var("out");
-    auto out_tensor = out_var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
-
-    f::Tensor result_tensor;
-    f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
-    float* ct = result_tensor.mutable_data<float>(cpu_place);
-
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    std::vector<float> send_vector(f::product(kDims), input_scope_idx + 12);
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
     }
-  }
+    f::LoD lod{{0, 10, 20}};
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    in_lod_tensor->set_lod(lod);
 
-  BroadcastDestroy();
-}
+    bc_op_handle_->Run(false);
 
-TEST_F(BroadcastTester, BroadcastTestSelectedRows) {
-  int gpu_id = 0;
-  BroadcastInitOp<f::SelectedRows>(gpu_id);
-
-  auto in_var = local_scope_[gpu_id]->Var("input");
-  auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-  auto value = in_selected_rows->mutable_value();
-  value->mutable_data<float>(kDims, gpu_list_[gpu_id]);
-  int height = kDims[0] * 2;
-  std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                            2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-  in_selected_rows->set_height(height);
-  in_selected_rows->set_rows(rows);
-
-  std::vector<float> send_vector(f::product(kDims));
-  for (size_t k = 0; k < send_vector.size(); ++k) {
-    send_vector[k] = k;
-  }
-  paddle::framework::TensorFromVector<float>(
-      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value);
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scope_[j]->Var("out");
+      auto out_tensor = out_var->Get<f::LoDTensor>();
+      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
 
-  bc_op_handle_->Run(false);
+      f::Tensor result_tensor;
+      f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.mutable_data<float>(cpu_place);
 
-  ctxs_->WaitAll();
+      for (int64_t j = 0; j < f::product(kDims); ++j) {
+        ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+      }
+    }
 
-  p::CPUPlace cpu_place;
-  for (size_t j = 0; j < gpu_list_.size(); ++j) {
-    auto out_var = local_scope_[j]->Var("out");
-    auto& out_select_rows = out_var->Get<f::SelectedRows>();
-    auto rt = out_select_rows.value();
+    BroadcastDestroy();
+  }
 
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
-    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+  void TestBroadcastSelectedRows() {
+    int input_scope_idx = 0;
+    BroadcastInitOp<f::SelectedRows>(input_scope_idx);
+
+    auto in_var = local_scope_[input_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+    auto value = in_selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    in_selected_rows->set_height(height);
+    in_selected_rows->set_rows(rows);
+
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
     }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    bc_op_handle_->Run(false);
 
-    f::Tensor result_tensor;
-    f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
-    float* ct = result_tensor.data<float>();
+    WaitAll();
 
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scope_[j]->Var("out");
+      auto& out_select_rows = out_var->Get<f::SelectedRows>();
+      auto rt = out_select_rows.value();
+
+      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                        "height is not equal.");
+      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+      }
+
+      f::Tensor result_tensor;
+      f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.data<float>();
+
+      for (int64_t j = 0; j < f::product(kDims); ++j) {
+        ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+      }
     }
+
+    BroadcastDestroy();
   }
 
-  BroadcastDestroy();
+ public:
+  f::Scope g_scope_;
+  std::vector<p::DeviceContext*> ctxs_;
+  std::vector<f::Scope*> local_scope_;
+  std::vector<p::Place> gpu_list_;
+  f::details::BroadcastOpHandle* bc_op_handle_;
+};
+
+TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) {
+  InitCtx(false);
+  TestBroadcastLodTensor();
+}
+
+TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
+  InitCtx(false);
+  TestBroadcastSelectedRows();
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) {
+  InitCtx(true);
+  TestBroadcastLodTensor();
+}
+
+TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
+  InitCtx(true);
+  TestBroadcastSelectedRows();
 }
+#endif
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 569dda17c6e91d5658c4f8b9ba0b8c8fbd966832..871e41343f53b801a22d3a450f0906f37fb372d1 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
   // version field currently is not used, however, just store the version to
   // debug easily.
   size_t version_;
+  size_t scope_idx_;
   std::string name_;
   platform::Place place_;
 };
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index fceb5845ff2cb21912fc31db5a43abb34d6d1ec5..39ef082266d618ec9352a44b999c8f4f003b2616 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -2,21 +2,19 @@
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
-
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
@@ -140,45 +138,6 @@ template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
   using TYPE = CUDAPinnedDeviceContext;
 };
-
-class ContextMap {
- public:
-  explicit ContextMap(const std::vector<platform::Place>& places) {
-    order_.reserve(places.size());
-    for (auto& p : places) {
-      auto dev = boost::get<CUDAPlace>(p);
-      int dev_id = dev.device;
-      order_.emplace_back(dev_id);
-      contexts_[dev_id].reset(new CUDADeviceContext(dev));
-    }
-    PADDLE_ENFORCE_EQ(
-        order_.size(), contexts_.size(),
-        "Context Map does not support contain two or more same device");
-  }
-
-  DeviceContext* DevCtx(int dev_id) const { return at(dev_id); }
-
-  DeviceContext* DevCtx(platform::Place p) const {
-    return DevCtx(boost::get<CUDAPlace>(p).device);
-  }
-
-  DeviceContext* at(platform::Place p) const {
-    return this->at(boost::get<CUDAPlace>(p).device);
-  }
-
-  DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); }
-
-  void WaitAll() {
-    for (auto& p : contexts_) {
-      p.second->Wait();
-    }
-  }
-
- private:
-  std::unordered_map<int, std::unique_ptr<DeviceContext>> contexts_;
-  std::vector<int> order_;
-};
-
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN