From 124c93081d26a89b677823a7e2d74260c579fb54 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 11 Apr 2018 14:39:00 +0800
Subject: [PATCH] remove ContextMap

---
 paddle/fluid/framework/details/CMakeLists.txt |   7 +-
 .../framework/details/broadcast_op_handle.cc  |  29 +--
 .../framework/details/broadcast_op_handle.h   |   5 +-
 .../details/broadcast_op_handle_test.cc       | 234 +++++++++++-------
 paddle/fluid/framework/details/var_handle.h   |   1 +
 paddle/fluid/platform/device_context.h        |  45 +---
 6 files changed, 157 insertions(+), 164 deletions(-)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 7b7582380cb..2a87f02bd56 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,16 +7,12 @@ if(WITH_GPU)
         dynload_cuda)
-    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 endif()
-
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
-
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
     set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
-    nv_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
-            device_context broadcast_op_handle)
 else()
     set(multi_devices_graph_builder_deps)
 endif()
@@ -25,3 +21,6 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
+
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
+        device_context broadcast_op_handle)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index a782ebf8fd3..2c99a347bf5 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -29,13 +29,8 @@ Tensor *GetTensorFromVar(Variable *in_var) {
   return nullptr;
 }
 BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                                     const std::vector<platform::Place> &places,
-                                     const platform::ContextMap &ctxs)
-    : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) {
-  for (auto &p : places_) {
-    this->dev_ctxes_[p] = ctxs_.DevCtx(p);
-  }
-}
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
 
 void BroadcastOpHandle::RunImpl() {
   PADDLE_ENFORCE_EQ(this->inputs_.size(), 1);
@@ -47,26 +42,22 @@ void BroadcastOpHandle::RunImpl() {
   if (inputs_[0]->generated_op_)
     inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]);
 
-  auto iter = std::find(places_.begin(), places_.end(), in_place);
-  if (iter == places_.end()) {
-    PADDLE_THROW("The input of BCast is not in the places_.");
-  }
-
-  int offset = iter - places_.begin();
-  auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_);
+  // Locate the source scope by index: places need not be unique (several
+  // scopes may live on the same place), so searching places_ is ambiguous.
+  auto in_scope_idx = in_var_handle->scope_idx_;
+  PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(),
+                    "The input of BCast is not in the local_scopes.");
+  auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_);
 
   Tensor *in_tensor = GetTensorFromVar(in_var);
   for (auto *out : outputs_) {
     auto out_handle = static_cast<VarHandle *>(out);
     auto &out_p = out_handle->place_;
 
-    auto iter = std::find(places_.begin(), places_.end(), out_p);
-    if (iter == places_.end()) {
-      PADDLE_THROW("The output of BCast is not in the places_.");
-    }
-    int offset = iter - places_.begin();
-
-    auto *s = local_scopes_[offset];
+    auto out_scope_idx = out_handle->scope_idx_;
+    PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(),
+                      "The output of BCast is not in the local_scopes.");
+    auto *s = local_scopes_[out_scope_idx];
     auto out_var = s->FindVar(out_handle->name_);
 
     PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), "");
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index a571af1218f..06ec164ce08 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -35,11 +35,10 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
   const std::vector<Scope *> &local_scopes_;
   const std::vector<platform::Place> &places_;
-  const platform::ContextMap &ctxs_;
+  // Device contexts are not held here; dev_ctxes_ is filled in by the creator.
 
   BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places,
-                    const platform::ContextMap &ctxs);
+                    const std::vector<platform::Place> &places);
 
   std::string Name() const override;
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index fd671ded217..d03115f0be6 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -25,51 +25,66 @@ const f::DDim kDims = {20, 20};
 
 class BroadcastTester : public ::testing::Test {
  public:
-  void SetUp() override {
-    int count = p::GetCUDADeviceCount();
-    if (count <= 1) {
-      LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                      "device count is "
-                   << count;
-      exit(0);
+  void InitCtx(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
     }
-    for (int i = 0; i < count; ++i) {
-      gpu_list_.emplace_back(p::CUDAPlace(i));
-    }
-    ctxs_ = new p::ContextMap(gpu_list_);
   }
 
   template <class T>
-  void BroadcastInitOp(int gpu_id = 0) {
+  void BroadcastInitOp(int input_scope_idx) {
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scope_.push_back(&g_scope_.NewScope());
       auto* out_var = local_scope_[j]->Var("out");
       out_var->GetMutable<T>();
     }
-    auto* in_var = local_scope_[gpu_id]->Var("input");
+    auto* in_var = local_scope_[input_scope_idx]->Var("input");
     in_var->GetMutable<T>();
 
-    bc_op_handle_ =
-        new f::details::BroadcastOpHandle(local_scope_, gpu_list_, *ctxs_);
+    bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_);
 
     f::details::VarHandle* in_var_handle = new f::details::VarHandle();
-    in_var_handle->place_ = gpu_list_[gpu_id];
+    in_var_handle->place_ = gpu_list_[input_scope_idx];
     in_var_handle->name_ = "input";
     in_var_handle->version_ = 1;
+    in_var_handle->scope_idx_ = input_scope_idx;
     in_var_handle->generated_op_ = nullptr;
     bc_op_handle_->AddInput(in_var_handle);
 
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j];
       f::details::VarHandle* out_var_handle = new f::details::VarHandle();
       out_var_handle->place_ = gpu_list_[j];
       out_var_handle->name_ = "out";
       out_var_handle->version_ = 2;
+      out_var_handle->scope_idx_ = j;
       out_var_handle->generated_op_ = bc_op_handle_;
       bc_op_handle_->AddOutput(out_var_handle);
     }
   }
   void BroadcastDestroy() {
-    delete ctxs_;
     for (auto in : bc_op_handle_->inputs_) {
       delete in;
     }
@@ -77,98 +92,131 @@ class BroadcastTester : public ::testing::Test {
       delete out;
     }
     delete bc_op_handle_;
+    // Free the device contexts and drop the now-dangling pointers.
+    for (auto* ctx : ctxs_) delete ctx;
+    ctxs_.clear();
   }
 
- public:
-  f::Scope g_scope_;
-  p::ContextMap* ctxs_;
-  std::vector<f::Scope*> local_scope_;
-  std::vector<p::Place> gpu_list_;
-  f::details::BroadcastOpHandle* bc_op_handle_;
-};
+  void WaitAll() {
+    for (auto* dev_ctx : ctxs_) {
+      dev_ctx->Wait();
+    }
+  }
 
-TEST_F(BroadcastTester, BroadcastTestLodTensor) {
-  int gpu_id = 0;
-  BroadcastInitOp<f::LoDTensor>(gpu_id);
+  void TestBroadcastLodTensor() {
+    int input_scope_idx = 0;
+    BroadcastInitOp<f::LoDTensor>(input_scope_idx);
 
-  auto in_var = local_scope_[gpu_id]->Var("input");
-  auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
-  in_lod_tensor->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+    auto in_var = local_scope_[input_scope_idx]->Var("input");
+    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
 
-  std::vector<float> send_vector(f::product(kDims), gpu_id + 12);
-  for (size_t k = 0; k < send_vector.size(); ++k) {
-    send_vector[k] = k;
-  }
-  f::LoD lod{{0, 10, 20}};
-  paddle::framework::TensorFromVector<float>(
-      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor);
-  in_lod_tensor->set_lod(lod);
-  bc_op_handle_->Run(false);
-
-  ctxs_->WaitAll();
-
-  p::CPUPlace cpu_place;
-  for (size_t j = 0; j < gpu_list_.size(); ++j) {
-    auto out_var = local_scope_[j]->Var("out");
-    auto out_tensor = out_var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
-
-    f::Tensor result_tensor;
-    f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
-    float* ct = result_tensor.mutable_data<float>(cpu_place);
-
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    std::vector<float> send_vector(f::product(kDims), input_scope_idx + 12);
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
     }
-  }
+    f::LoD lod{{0, 10, 20}};
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    in_lod_tensor->set_lod(lod);
 
-  BroadcastDestroy();
-}
+    bc_op_handle_->Run(false);
 
-TEST_F(BroadcastTester, BroadcastTestSelectedRows) {
-  int gpu_id = 0;
-  BroadcastInitOp<f::SelectedRows>(gpu_id);
-
-  auto in_var = local_scope_[gpu_id]->Var("input");
-  auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-  auto value = in_selected_rows->mutable_value();
-  value->mutable_data<float>(kDims, gpu_list_[gpu_id]);
-  int height = kDims[0] * 2;
-  std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                            2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-  in_selected_rows->set_height(height);
-  in_selected_rows->set_rows(rows);
-
-  std::vector<float> send_vector(f::product(kDims));
-  for (size_t k = 0; k < send_vector.size(); ++k) {
-    send_vector[k] = k;
-  }
-  paddle::framework::TensorFromVector<float>(
-      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value);
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scope_[j]->Var("out");
+      auto out_tensor = out_var->Get<f::LoDTensor>();
+      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
 
-  bc_op_handle_->Run(false);
+      f::Tensor result_tensor;
+      f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.mutable_data<float>(cpu_place);
 
-  ctxs_->WaitAll();
+      for (int64_t k = 0; k < f::product(kDims); ++k) {
+        ASSERT_NEAR(ct[k], send_vector[k], 1e-5);
+      }
+    }
 
-  p::CPUPlace cpu_place;
-  for (size_t j = 0; j < gpu_list_.size(); ++j) {
-    auto out_var = local_scope_[j]->Var("out");
-    auto& out_select_rows = out_var->Get<f::SelectedRows>();
-    auto rt = out_select_rows.value();
+    BroadcastDestroy();
+  }
 
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
-    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+  void TestBroadcastSelectedRows() {
+    int input_scope_idx = 0;
+    BroadcastInitOp<f::SelectedRows>(input_scope_idx);
+
+    auto in_var = local_scope_[input_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+    auto value = in_selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    in_selected_rows->set_height(height);
+    in_selected_rows->set_rows(rows);
+
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
     }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    bc_op_handle_->Run(false);
 
-    f::Tensor result_tensor;
-    f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
-    float* ct = result_tensor.data<float>();
+    WaitAll();
 
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scope_[j]->Var("out");
+      auto& out_select_rows = out_var->Get<f::SelectedRows>();
+      auto rt = out_select_rows.value();
+
+      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                        "height is not equal.");
+      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+      }
+
+      f::Tensor result_tensor;
+      f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.data<float>();
+
+      for (int64_t k = 0; k < f::product(kDims); ++k) {
+        ASSERT_NEAR(ct[k], send_vector[k], 1e-5);
+      }
     }
+
+    BroadcastDestroy();
   }
 
-  BroadcastDestroy();
+ public:
+  f::Scope g_scope_;
+  std::vector<p::DeviceContext*> ctxs_;
+  std::vector<f::Scope*> local_scope_;
+  std::vector<p::Place> gpu_list_;
+  f::details::BroadcastOpHandle* bc_op_handle_;
+};
+
+TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) {
+  InitCtx(false);
+  TestBroadcastLodTensor();
+}
+
+TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
+  InitCtx(false);
+  TestBroadcastSelectedRows();
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) {
+  InitCtx(true);
+  TestBroadcastLodTensor();
+}
+
+TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
+  InitCtx(true);
+  TestBroadcastSelectedRows();
 }
+#endif
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 569dda17c6e..871e41343f5 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
   // version field currently is not used, however, just store the version to
   // debug easily.
   size_t version_;
+  size_t scope_idx_;  // index of this var's scope in the op's local scopes
   std::string name_;
   platform::Place place_;
 };
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index fceb5845ff2..39ef082266d 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -2,21 +2,19 @@
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
-
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
@@ -140,45 +138,6 @@ template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
   using TYPE = CUDAPinnedDeviceContext;
 };
-
-class ContextMap {
- public:
-  explicit ContextMap(const std::vector<platform::Place>& places) {
-    order_.reserve(places.size());
-    for (auto& p : places) {
-      auto dev = boost::get<CUDAPlace>(p);
-      int dev_id = dev.device;
-      order_.emplace_back(dev_id);
-      contexts_[dev_id].reset(new CUDADeviceContext(dev));
-    }
-    PADDLE_ENFORCE_EQ(
-        order_.size(), contexts_.size(),
-        "Context Map does not support contain two or more same device");
-  }
-
-  DeviceContext* DevCtx(int dev_id) const { return at(dev_id); }
-
-  DeviceContext* DevCtx(platform::Place p) const {
-    return DevCtx(boost::get<CUDAPlace>(p).device);
-  }
-
-  DeviceContext* at(platform::Place p) const {
-    return this->at(boost::get<CUDAPlace>(p).device);
-  }
-
-  DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); }
-
-  void WaitAll() {
-    for (auto& p : contexts_) {
-      p.second->Wait();
-    }
-  }
-
- private:
-  std::unordered_map<int, std::unique_ptr<DeviceContext>> contexts_;
-  std::vector<int> order_;
-};
-
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
-- 
GitLab