remove ContextMap

124c9308 · chengduoZH · 6db96ec2 · 124c9308 · 124c9308 · 124c9308
6 changed files
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,16 +7,12 @@ if(WITH_GPU)
        dynload_cuda)
    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 endif()
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 if(WITH_GPU)
    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
-    nv_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
-            device_context broadcast_op_handle)
 else()
    set(multi_devices_graph_builder_deps)
 endif()
@@ -25,3 +21,6 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
+        device_context broadcast_op_handle)
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -29,13 +29,8 @@ Tensor *GetTensorFromVar(Variable *in_var) {
  return nullptr;
 }
 BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                                     const std::vector<platform::Place> &places,
+                                     const std::vector<platform::Place> &places)
-                                     const platform::ContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places) {}
-    : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) {
-  for (auto &p : places_) {
-    this->dev_ctxes_[p] = ctxs_.DevCtx(p);
-  }
-}
 void BroadcastOpHandle::RunImpl() {
  PADDLE_ENFORCE_EQ(this->inputs_.size(), 1);
@@ -47,26 +42,18 @@ void BroadcastOpHandle::RunImpl() {
  if (inputs_[0]->generated_op_)
    inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]);
-  auto iter = std::find(places_.begin(), places_.end(), in_place);
+  auto in_scope_idx = in_var_handle->scope_idx_;
-  if (iter == places_.end()) {
+  PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), "");
-    PADDLE_THROW("The input of BCast is not in the places_.");
+  auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_);
-  }
-  int offset = iter - places_.begin();
-  auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_);
  Tensor *in_tensor = GetTensorFromVar(in_var);
  for (auto *out : outputs_) {
    auto out_handle = static_cast<VarHandle *>(out);
    auto &out_p = out_handle->place_;
-    auto iter = std::find(places_.begin(), places_.end(), out_p);
+    auto out_scope_idx = out_handle->scope_idx_;
-    if (iter == places_.end()) {
+    PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), "");
-      PADDLE_THROW("The output of BCast is not in the places_.");
+    auto *s = local_scopes_[out_scope_idx];
-    }
-    int offset = iter - places_.begin();
-    auto *s = local_scopes_[offset];
    auto out_var = s->FindVar(out_handle->name_);
    PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), "");

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -35,11 +35,10 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
  const std::vector<Scope *> &local_scopes_;
  const std::vector<platform::Place> &places_;
-  const platform::ContextMap &ctxs_;
+  //  const platform::ContextMap &ctxs_;
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places,
+                    const std::vector<platform::Place> &places);
-                    const platform::ContextMap &ctxs);
  std::string Name() const override;

--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -25,51 +25,66 @@ const f::DDim kDims = {20, 20};
 class BroadcastTester : public ::testing::Test {
 public:
-  void SetUp() override {
+  void InitCtx(bool use_gpu) {
-    int count = p::GetCUDADeviceCount();
+    if (use_gpu) {
-    if (count <= 1) {
+#ifdef PADDLE_WITH_CUDA
-      LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+      int count = p::GetCUDADeviceCount();
-                      "device count is "
+      if (count <= 1) {
-                   << count;
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-      exit(0);
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
    }
-    for (int i = 0; i < count; ++i) {
-      gpu_list_.emplace_back(p::CUDAPlace(i));
-    }
-    ctxs_ = new p::ContextMap(gpu_list_);
  }
  template <class T>
-  void BroadcastInitOp(int gpu_id = 0) {
+  void BroadcastInitOp(int input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scope_.push_back(&g_scope_.NewScope());
      auto* out_var = local_scope_[j]->Var("out");
      out_var->GetMutable<T>();
    }
-    auto* in_var = local_scope_[gpu_id]->Var("input");
+    auto* in_var = local_scope_[input_scope_idx]->Var("input");
    in_var->GetMutable<T>();
-    bc_op_handle_ =
+    bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_);
-        new f::details::BroadcastOpHandle(local_scope_, gpu_list_, *ctxs_);
    f::details::VarHandle* in_var_handle = new f::details::VarHandle();
-    in_var_handle->place_ = gpu_list_[gpu_id];
+    in_var_handle->place_ = gpu_list_[input_scope_idx];
    in_var_handle->name_ = "input";
    in_var_handle->version_ = 1;
+    in_var_handle->scope_idx_ = input_scope_idx;
    in_var_handle->generated_op_ = nullptr;
    bc_op_handle_->AddInput(in_var_handle);
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j];
      f::details::VarHandle* out_var_handle = new f::details::VarHandle();
      out_var_handle->place_ = gpu_list_[j];
      out_var_handle->name_ = "out";
      out_var_handle->version_ = 2;
+      out_var_handle->scope_idx_ = j;
      out_var_handle->generated_op_ = bc_op_handle_;
      bc_op_handle_->AddOutput(out_var_handle);
    }
  }
  void BroadcastDestroy() {
-    delete ctxs_;
    for (auto in : bc_op_handle_->inputs_) {
      delete in;
    }
@@ -77,98 +92,131 @@ class BroadcastTester : public ::testing::Test {
      delete out;
    }
    delete bc_op_handle_;
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      delete ctxs_[j];
+    }
  }
- public:
+  void WaitAll() {
-  f::Scope g_scope_;
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
-  p::ContextMap* ctxs_;
+      ctxs_[j]->Wait();
-  std::vector<f::Scope*> local_scope_;
+    }
-  std::vector<p::Place> gpu_list_;
+  }
-  f::details::BroadcastOpHandle* bc_op_handle_;
-};
-TEST_F(BroadcastTester, BroadcastTestLodTensor) {
+  void TestBroadcastLodTensor() {
-  int gpu_id = 0;
+    int input_scope_idx = 0;
-  BroadcastInitOp<f::LoDTensor>(gpu_id);
+    BroadcastInitOp<f::LoDTensor>(input_scope_idx);
-  auto in_var = local_scope_[gpu_id]->Var("input");
+    auto in_var = local_scope_[input_scope_idx]->Var("input");
-  auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
-  in_lod_tensor->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-  std::vector<float> send_vector(f::product(kDims), gpu_id + 12);
+    std::vector<float> send_vector(f::product(kDims), input_scope_idx + 12);
-  for (size_t k = 0; k < send_vector.size(); ++k) {
+    for (size_t k = 0; k < send_vector.size(); ++k) {
-    send_vector[k] = k;
+      send_vector[k] = k;
-  }
-  f::LoD lod{{0, 10, 20}};
-  paddle::framework::TensorFromVector<float>(
-      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor);
-  in_lod_tensor->set_lod(lod);
-  bc_op_handle_->Run(false);
-  ctxs_->WaitAll();
-  p::CPUPlace cpu_place;
-  for (size_t j = 0; j < gpu_list_.size(); ++j) {
-    auto out_var = local_scope_[j]->Var("out");
-    auto out_tensor = out_var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
-    f::Tensor result_tensor;
-    f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
-    float* ct = result_tensor.mutable_data<float>(cpu_place);
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
    }
-  }
+    f::LoD lod{{0, 10, 20}};
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    in_lod_tensor->set_lod(lod);
-  BroadcastDestroy();
+    bc_op_handle_->Run(false);
-}
-TEST_F(BroadcastTester, BroadcastTestSelectedRows) {
+    WaitAll();
-  int gpu_id = 0;
-  BroadcastInitOp<f::SelectedRows>(gpu_id);
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-  auto in_var = local_scope_[gpu_id]->Var("input");
+      auto out_var = local_scope_[j]->Var("out");
-  auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto out_tensor = out_var->Get<f::LoDTensor>();
-  auto value = in_selected_rows->mutable_value();
+      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
-  value->mutable_data<float>(kDims, gpu_list_[gpu_id]);
-  int height = kDims[0] * 2;
-  std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                            2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-  in_selected_rows->set_height(height);
-  in_selected_rows->set_rows(rows);
-  std::vector<float> send_vector(f::product(kDims));
-  for (size_t k = 0; k < send_vector.size(); ++k) {
-    send_vector[k] = k;
-  }
-  paddle::framework::TensorFromVector<float>(
-      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value);
-  bc_op_handle_->Run(false);
+      f::Tensor result_tensor;
+      f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.mutable_data<float>(cpu_place);
-  ctxs_->WaitAll();
+      for (int64_t j = 0; j < f::product(kDims); ++j) {
+        ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+      }
+    }
-  p::CPUPlace cpu_place;
+    BroadcastDestroy();
-  for (size_t j = 0; j < gpu_list_.size(); ++j) {
+  }
-    auto out_var = local_scope_[j]->Var("out");
-    auto& out_select_rows = out_var->Get<f::SelectedRows>();
-    auto rt = out_select_rows.value();
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+  void TestBroadcastSelectedRows() {
-    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+    int input_scope_idx = 0;
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+    BroadcastInitOp<f::SelectedRows>(input_scope_idx);
+    auto in_var = local_scope_[input_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+    auto value = in_selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    in_selected_rows->set_height(height);
+    in_selected_rows->set_rows(rows);
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
    }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+    bc_op_handle_->Run(false);
-    f::Tensor result_tensor;
+    WaitAll();
-    f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
-    float* ct = result_tensor.data<float>();
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
+    p::CPUPlace cpu_place;
-      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scope_[j]->Var("out");
+      auto& out_select_rows = out_var->Get<f::SelectedRows>();
+      auto rt = out_select_rows.value();
+      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                        "height is not equal.");
+      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+      }
+      f::Tensor result_tensor;
+      f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.data<float>();
+      for (int64_t j = 0; j < f::product(kDims); ++j) {
+        ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+      }
    }
+    BroadcastDestroy();
  }
-  BroadcastDestroy();
+ public:
+  f::Scope g_scope_;
+  std::vector<p::DeviceContext*> ctxs_;
+  std::vector<f::Scope*> local_scope_;
+  std::vector<p::Place> gpu_list_;
+  f::details::BroadcastOpHandle* bc_op_handle_;
+};
+TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) {
+  InitCtx(false);
+  TestBroadcastLodTensor();
+}
+TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
+  InitCtx(false);
+  TestBroadcastSelectedRows();
+}
+#ifdef PADDLE_WITH_CUDA
+TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) {
+  InitCtx(true);
+  TestBroadcastLodTensor();
+}
+TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
+  InitCtx(true);
+  TestBroadcastSelectedRows();
 }
+#endif
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
  // version field currently is not used, however, just store the version to
  // debug easily.
  size_t version_;
+  size_t scope_idx_;
  std::string name_;
  platform::Place place_;
 };

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -2,21 +2,19 @@
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
- Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
@@ -140,45 +138,6 @@ template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
  using TYPE = CUDAPinnedDeviceContext;
 };
-class ContextMap {
- public:
-  explicit ContextMap(const std::vector<platform::Place>& places) {
-    order_.reserve(places.size());
-    for (auto& p : places) {
-      auto dev = boost::get<CUDAPlace>(p);
-      int dev_id = dev.device;
-      order_.emplace_back(dev_id);
-      contexts_[dev_id].reset(new CUDADeviceContext(dev));
-    }
-    PADDLE_ENFORCE_EQ(
-        order_.size(), contexts_.size(),
-        "Context Map does not support contain two or more same device");
-  }
-  DeviceContext* DevCtx(int dev_id) const { return at(dev_id); }
-  DeviceContext* DevCtx(platform::Place p) const {
-    return DevCtx(boost::get<CUDAPlace>(p).device);
-  }
-  DeviceContext* at(platform::Place p) const {
-    return this->at(boost::get<CUDAPlace>(p).device);
-  }
-  DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); }
-  void WaitAll() {
-    for (auto& p : contexts_) {
-      p.second->Wait();
-    }
-  }
- private:
-  std::unordered_map<int, std::unique_ptr<DeviceContext>> contexts_;
-  std::vector<int> order_;
-};
 #endif
 #ifdef PADDLE_WITH_MKLDNN