提交 124c9308 编写于 作者: C chengduoZH

remove ContextMap

上级 6db96ec2
...@@ -7,16 +7,12 @@ if(WITH_GPU) ...@@ -7,16 +7,12 @@ if(WITH_GPU)
dynload_cuda) dynload_cuda)
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
endif() endif()
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU) if(WITH_GPU)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
nv_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
device_context broadcast_op_handle)
else() else()
set(multi_devices_graph_builder_deps) set(multi_devices_graph_builder_deps)
endif() endif()
...@@ -25,3 +21,6 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ...@@ -25,3 +21,6 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context) simple_threadpool device_context)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
device_context broadcast_op_handle)
...@@ -29,13 +29,8 @@ Tensor *GetTensorFromVar(Variable *in_var) { ...@@ -29,13 +29,8 @@ Tensor *GetTensorFromVar(Variable *in_var) {
return nullptr; return nullptr;
} }
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes, BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places)
const platform::ContextMap &ctxs) : local_scopes_(local_scopes), places_(places) {}
: local_scopes_(local_scopes), places_(places), ctxs_(ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs_.DevCtx(p);
}
}
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(this->inputs_.size(), 1); PADDLE_ENFORCE_EQ(this->inputs_.size(), 1);
...@@ -47,26 +42,18 @@ void BroadcastOpHandle::RunImpl() { ...@@ -47,26 +42,18 @@ void BroadcastOpHandle::RunImpl() {
if (inputs_[0]->generated_op_) if (inputs_[0]->generated_op_)
inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]);
auto iter = std::find(places_.begin(), places_.end(), in_place); auto in_scope_idx = in_var_handle->scope_idx_;
if (iter == places_.end()) { PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), "");
PADDLE_THROW("The input of BCast is not in the places_."); auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_);
}
int offset = iter - places_.begin();
auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_);
Tensor *in_tensor = GetTensorFromVar(in_var); Tensor *in_tensor = GetTensorFromVar(in_var);
for (auto *out : outputs_) { for (auto *out : outputs_) {
auto out_handle = static_cast<VarHandle *>(out); auto out_handle = static_cast<VarHandle *>(out);
auto &out_p = out_handle->place_; auto &out_p = out_handle->place_;
auto iter = std::find(places_.begin(), places_.end(), out_p); auto out_scope_idx = out_handle->scope_idx_;
if (iter == places_.end()) { PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), "");
PADDLE_THROW("The output of BCast is not in the places_."); auto *s = local_scopes_[out_scope_idx];
}
int offset = iter - places_.begin();
auto *s = local_scopes_[offset];
auto out_var = s->FindVar(out_handle->name_); auto out_var = s->FindVar(out_handle->name_);
PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), ""); PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), "");
......
...@@ -35,11 +35,10 @@ namespace details { ...@@ -35,11 +35,10 @@ namespace details {
struct BroadcastOpHandle : public OpHandleBase { struct BroadcastOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_; const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_; const std::vector<platform::Place> &places_;
const platform::ContextMap &ctxs_; // const platform::ContextMap &ctxs_;
BroadcastOpHandle(const std::vector<Scope *> &local_scopes, BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places);
const platform::ContextMap &ctxs);
std::string Name() const override; std::string Name() const override;
......
...@@ -25,51 +25,66 @@ const f::DDim kDims = {20, 20}; ...@@ -25,51 +25,66 @@ const f::DDim kDims = {20, 20};
class BroadcastTester : public ::testing::Test { class BroadcastTester : public ::testing::Test {
public: public:
void SetUp() override { void InitCtx(bool use_gpu) {
int count = p::GetCUDADeviceCount(); if (use_gpu) {
if (count <= 1) { #ifdef PADDLE_WITH_CUDA
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " int count = p::GetCUDADeviceCount();
"device count is " if (count <= 1) {
<< count; LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
exit(0); "device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
} }
for (int i = 0; i < count; ++i) {
gpu_list_.emplace_back(p::CUDAPlace(i));
}
ctxs_ = new p::ContextMap(gpu_list_);
} }
template <class T> template <class T>
void BroadcastInitOp(int gpu_id = 0) { void BroadcastInitOp(int input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) { for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scope_.push_back(&g_scope_.NewScope()); local_scope_.push_back(&g_scope_.NewScope());
auto* out_var = local_scope_[j]->Var("out"); auto* out_var = local_scope_[j]->Var("out");
out_var->GetMutable<T>(); out_var->GetMutable<T>();
} }
auto* in_var = local_scope_[gpu_id]->Var("input"); auto* in_var = local_scope_[input_scope_idx]->Var("input");
in_var->GetMutable<T>(); in_var->GetMutable<T>();
bc_op_handle_ = bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_);
new f::details::BroadcastOpHandle(local_scope_, gpu_list_, *ctxs_);
f::details::VarHandle* in_var_handle = new f::details::VarHandle(); f::details::VarHandle* in_var_handle = new f::details::VarHandle();
in_var_handle->place_ = gpu_list_[gpu_id]; in_var_handle->place_ = gpu_list_[input_scope_idx];
in_var_handle->name_ = "input"; in_var_handle->name_ = "input";
in_var_handle->version_ = 1; in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = input_scope_idx;
in_var_handle->generated_op_ = nullptr; in_var_handle->generated_op_ = nullptr;
bc_op_handle_->AddInput(in_var_handle); bc_op_handle_->AddInput(in_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) { for (size_t j = 0; j < gpu_list_.size(); ++j) {
bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j];
f::details::VarHandle* out_var_handle = new f::details::VarHandle(); f::details::VarHandle* out_var_handle = new f::details::VarHandle();
out_var_handle->place_ = gpu_list_[j]; out_var_handle->place_ = gpu_list_[j];
out_var_handle->name_ = "out"; out_var_handle->name_ = "out";
out_var_handle->version_ = 2; out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = j;
out_var_handle->generated_op_ = bc_op_handle_; out_var_handle->generated_op_ = bc_op_handle_;
bc_op_handle_->AddOutput(out_var_handle); bc_op_handle_->AddOutput(out_var_handle);
} }
} }
void BroadcastDestroy() { void BroadcastDestroy() {
delete ctxs_;
for (auto in : bc_op_handle_->inputs_) { for (auto in : bc_op_handle_->inputs_) {
delete in; delete in;
} }
...@@ -77,98 +92,131 @@ class BroadcastTester : public ::testing::Test { ...@@ -77,98 +92,131 @@ class BroadcastTester : public ::testing::Test {
delete out; delete out;
} }
delete bc_op_handle_; delete bc_op_handle_;
for (size_t j = 0; j < ctxs_.size(); ++j) {
delete ctxs_[j];
}
} }
public: void WaitAll() {
f::Scope g_scope_; for (size_t j = 0; j < ctxs_.size(); ++j) {
p::ContextMap* ctxs_; ctxs_[j]->Wait();
std::vector<f::Scope*> local_scope_; }
std::vector<p::Place> gpu_list_; }
f::details::BroadcastOpHandle* bc_op_handle_;
};
TEST_F(BroadcastTester, BroadcastTestLodTensor) { void TestBroadcastLodTensor() {
int gpu_id = 0; int input_scope_idx = 0;
BroadcastInitOp<f::LoDTensor>(gpu_id); BroadcastInitOp<f::LoDTensor>(input_scope_idx);
auto in_var = local_scope_[gpu_id]->Var("input"); auto in_var = local_scope_[input_scope_idx]->Var("input");
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>(); auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[gpu_id]); in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
std::vector<float> send_vector(f::product(kDims), gpu_id + 12); std::vector<float> send_vector(f::product(kDims), input_scope_idx + 12);
for (size_t k = 0; k < send_vector.size(); ++k) { for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k; send_vector[k] = k;
}
f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor);
in_lod_tensor->set_lod(lod);
bc_op_handle_->Run(false);
ctxs_->WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scope_[j]->Var("out");
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
} }
} f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
BroadcastDestroy(); bc_op_handle_->Run(false);
}
TEST_F(BroadcastTester, BroadcastTestSelectedRows) { WaitAll();
int gpu_id = 0;
BroadcastInitOp<f::SelectedRows>(gpu_id); p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto in_var = local_scope_[gpu_id]->Var("input"); auto out_var = local_scope_[j]->Var("out");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto out_tensor = out_var->Get<f::LoDTensor>();
auto value = in_selected_rows->mutable_value(); PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
value->mutable_data<float>(kDims, gpu_list_[gpu_id]);
int height = kDims[0] * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(f::product(kDims));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value);
bc_op_handle_->Run(false); f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
ctxs_->WaitAll(); for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
}
}
p::CPUPlace cpu_place; BroadcastDestroy();
for (size_t j = 0; j < gpu_list_.size(); ++j) { }
auto out_var = local_scope_[j]->Var("out");
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); void TestBroadcastSelectedRows() {
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { int input_scope_idx = 0;
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]); BroadcastInitOp<f::SelectedRows>(input_scope_idx);
auto in_var = local_scope_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
int height = kDims[0] * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(f::product(kDims));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
} }
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
bc_op_handle_->Run(false);
f::Tensor result_tensor; WaitAll();
f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) { p::CPUPlace cpu_place;
ASSERT_NEAR(ct[j], send_vector[j], 1e-5); for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scope_[j]->Var("out");
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
"height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
}
} }
BroadcastDestroy();
} }
BroadcastDestroy(); public:
f::Scope g_scope_;
std::vector<p::DeviceContext*> ctxs_;
std::vector<f::Scope*> local_scope_;
std::vector<p::Place> gpu_list_;
f::details::BroadcastOpHandle* bc_op_handle_;
};
TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) {
InitCtx(false);
TestBroadcastLodTensor();
}
TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
InitCtx(false);
TestBroadcastSelectedRows();
}
#ifdef PADDLE_WITH_CUDA
TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) {
InitCtx(true);
TestBroadcastLodTensor();
}
TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
InitCtx(true);
TestBroadcastSelectedRows();
} }
#endif
...@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase { ...@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
// version field currently is not used, however, just store the version to // version field currently is not used, however, just store the version to
// debug easily. // debug easily.
size_t version_; size_t version_;
size_t scope_idx_;
std::string name_; std::string name_;
platform::Place place_; platform::Place place_;
}; };
......
...@@ -2,21 +2,19 @@ ...@@ -2,21 +2,19 @@
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
...@@ -140,45 +138,6 @@ template <> ...@@ -140,45 +138,6 @@ template <>
struct DefaultDeviceContextType<platform::CUDAPinnedPlace> { struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
using TYPE = CUDAPinnedDeviceContext; using TYPE = CUDAPinnedDeviceContext;
}; };
class ContextMap {
public:
explicit ContextMap(const std::vector<platform::Place>& places) {
order_.reserve(places.size());
for (auto& p : places) {
auto dev = boost::get<CUDAPlace>(p);
int dev_id = dev.device;
order_.emplace_back(dev_id);
contexts_[dev_id].reset(new CUDADeviceContext(dev));
}
PADDLE_ENFORCE_EQ(
order_.size(), contexts_.size(),
"Context Map does not support contain two or more same device");
}
DeviceContext* DevCtx(int dev_id) const { return at(dev_id); }
DeviceContext* DevCtx(platform::Place p) const {
return DevCtx(boost::get<CUDAPlace>(p).device);
}
DeviceContext* at(platform::Place p) const {
return this->at(boost::get<CUDAPlace>(p).device);
}
DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); }
void WaitAll() {
for (auto& p : contexts_) {
p.second->Wait();
}
}
private:
std::unordered_map<int, std::unique_ptr<DeviceContext>> contexts_;
std::vector<int> order_;
};
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册