From 8eaec5dd7c5d627aa2d23db1fc518a1e85a30821 Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Mon, 9 Apr 2018 15:28:07 +0800
Subject: [PATCH] add BCast and Gather

---
 paddle/fluid/framework/details/CMakeLists.txt |   8 +-
 .../framework/details/broad_cast_op_handle.cc | 103 +++++++++++
 .../framework/details/broad_cast_op_handle.h  |  54 ++++++
 .../details/broad_cast_op_handle_test.cc      | 174 ++++++++++++++++++
 paddle/fluid/platform/device_context.h        |  46 ++++-
 5 files changed, 382 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/framework/details/broad_cast_op_handle.cc
 create mode 100644 paddle/fluid/framework/details/broad_cast_op_handle.h
 create mode 100644 paddle/fluid/framework/details/broad_cast_op_handle_test.cc

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 89b5c6847f..eda2b6aac0 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,12 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda)
+  nv_library(broad_cast_op_handle SRCS broad_cast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+endif()
+
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
@@ -11,6 +15,8 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_test(broad_cast_op_test SRCS broad_cast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
+      device_context broad_cast_op_handle)
 else()
   set(multi_devices_graph_builder_deps)
 endif()
diff --git a/paddle/fluid/framework/details/broad_cast_op_handle.cc b/paddle/fluid/framework/details/broad_cast_op_handle.cc
new file mode 100644
index 0000000000..e636371b94
--- /dev/null
+++ b/paddle/fluid/framework/details/broad_cast_op_handle.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broad_cast_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+Tensor *GetTensorFromVar(Variable *in_var) {
+  if (in_var->IsType<LoDTensor>()) {
+    return in_var->GetMutable<LoDTensor>();
+  } else if (in_var->IsType<SelectedRows>()) {
+    return in_var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+  return nullptr;
+}
+BCastOpHandle::BCastOpHandle(const std::vector<Scope *> &local_scopes,
+                             const std::vector<platform::Place> &places,
+                             const platform::ContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) {
+  for (auto &p : places_) {
+    this->dev_ctxes_[p] = ctxs_.DevCtx(p);
+  }
+}
+
+void BCastOpHandle::RunImpl() {
+  PADDLE_ENFORCE_EQ(this->inputs_.size(), 1);
+  PADDLE_ENFORCE_EQ(this->outputs_.size(), places_.size());
+
+  // Wait until the input has been generated; this Wait is an asynchronous
+  // operation.
+  auto in_var_handle = static_cast<VarHandle *>(this->inputs_[0]);
+  auto &in_place = in_var_handle->place_;
+  if (inputs_[0]->generated_op_)
+    inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]);
+
+  auto iter = std::find(places_.begin(), places_.end(), in_place);
+  if (iter == places_.end()) {
+    PADDLE_THROW("The input of BCast is not in the places_.");
+  }
+
+  int offset = iter - places_.begin();
+  auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_);
+
+  Tensor *in_tensor = GetTensorFromVar(in_var);
+  for (auto *out : outputs_) {
+    auto out_handle = static_cast<VarHandle *>(out);
+    auto &out_p = out_handle->place_;
+
+    auto iter = std::find(places_.begin(), places_.end(), out_p);
+    if (iter == places_.end()) {
+      PADDLE_THROW("The output of BCast is not in the places_.");
+    }
+    int offset = iter - places_.begin();
+
+    auto *s = local_scopes_[offset];
+    auto out_var = s->FindVar(out_handle->name_);
+
+    PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), "");
+
+    if (in_var->IsType<SelectedRows>()) {
+      // Copy the metadata (height, rows) and allocate the value tensor on
+      // the destination place; the data itself is copied below.
+      auto in_sr = in_var->GetMutable<SelectedRows>();
+      auto out = out_var->GetMutable<SelectedRows>();
+      if (in_sr == out) continue;
+      out->set_height(in_sr->height());
+      out->set_rows(in_sr->rows());
+      out->mutable_value()->Resize(in_sr->value().dims());
+      out->mutable_value()->mutable_data(out_p, in_sr->value().type());
+    } else if (in_var->IsType<LoDTensor>()) {
+      auto in_lod = in_var->GetMutable<LoDTensor>();
+      auto out = out_var->GetMutable<LoDTensor>();
+      if (in_lod == out) continue;
+      out->set_lod(in_lod->lod());
+      out->Resize(in_lod->dims());
+      out->mutable_data(out_p, in_lod->type());
+    } else {
+      PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+    }
+
+    Tensor *out_tensor = GetTensorFromVar(out_var);
+
+    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
+                                  out_tensor);
+  }
+}
+
+std::string BCastOpHandle::Name() const { return "broadcast"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broad_cast_op_handle.h b/paddle/fluid/framework/details/broad_cast_op_handle.h
new file mode 100644
index 0000000000..432e86e410
--- /dev/null
+++ b/paddle/fluid/framework/details/broad_cast_op_handle.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+/*
+ * Broadcast the input variable to all the local scopes (one per place).
+ */
+struct BCastOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::ContextMap &ctxs_;
+
+  BCastOpHandle(const std::vector<Scope *> &local_scopes,
+                const std::vector<platform::Place> &places,
+                const platform::ContextMap &ctxs);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; }
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broad_cast_op_handle_test.cc b/paddle/fluid/framework/details/broad_cast_op_handle_test.cc
new file mode 100644
index 0000000000..a1338abeb5
--- /dev/null
+++ b/paddle/fluid/framework/details/broad_cast_op_handle_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broad_cast_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+class BroadCastTester : public ::testing::Test {
+ public:
+  void SetUp() override {
+    int count = p::GetCUDADeviceCount();
+    if (count <= 1) {
+      LOG(WARNING) << "Cannot test multi-gpu BroadCast, because the CUDA "
+                      "device count is "
+                   << count;
+      exit(0);
+    }
+    for (int i = 0; i < count; ++i) {
+      gpu_list_.emplace_back(p::CUDAPlace(i));
+    }
+    ctxs_ = new p::ContextMap(gpu_list_);
+  }
+
+  template <typename T>
+  void BroadCastInitOp(int gpu_id = 0) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scope_.push_back(&g_scope_.NewScope());
+      auto* out_var = local_scope_[j]->Var("out");
+      out_var->GetMutable<T>();
+    }
+    auto* in_var = local_scope_[gpu_id]->Var("input");
+    in_var->GetMutable<T>();
+
+    bc_op_handle_ =
+        new f::details::BCastOpHandle(local_scope_, gpu_list_, *ctxs_);
+
+    f::details::VarHandle* in_var_handle = new f::details::VarHandle();
+    in_var_handle->place_ = gpu_list_[gpu_id];
+    in_var_handle->name_ = "input";
+    in_var_handle->version_ = 1;
+    in_var_handle->generated_op_ = nullptr;
+    bc_op_handle_->AddInput(in_var_handle);
+
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      f::details::VarHandle* out_var_handle = new f::details::VarHandle();
+      out_var_handle->place_ = gpu_list_[j];
+      out_var_handle->name_ = "out";
+      out_var_handle->version_ = 2;
+      out_var_handle->generated_op_ = bc_op_handle_;
+      bc_op_handle_->AddOutput(out_var_handle);
+    }
+  }
+  void BroadCastDestroy() {
+    delete ctxs_;
+    for (auto in : bc_op_handle_->inputs_) {
+      delete in;
+    }
+    for (auto out : bc_op_handle_->outputs_) {
+      delete out;
+    }
+    delete bc_op_handle_;
+  }
+
+ public:
+  f::Scope g_scope_;
+  p::ContextMap* ctxs_;
+  std::vector<f::Scope*> local_scope_;
+  std::vector<p::Place> gpu_list_;
+  f::details::BCastOpHandle* bc_op_handle_;
+};
+
+TEST_F(BroadCastTester, BroadCastTestLodTensor) {
+  int gpu_id = 0;
+  BroadCastInitOp<f::LoDTensor>(gpu_id);
+
+  auto in_var = local_scope_[gpu_id]->Var("input");
+  auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+  in_lod_tensor->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+
+  std::vector<float> send_vector(f::product(kDims), gpu_id + 12);
+  for (size_t k = 0; k < send_vector.size(); ++k) {
+    send_vector[k] = k;
+  }
+  f::LoD lod{{0, 10, 20}};
+  paddle::framework::TensorFromVector(
+      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor);
+  in_lod_tensor->set_lod(lod);
+  bc_op_handle_->Run(false);
+
+  ctxs_->WaitAll();
+
+  p::CPUPlace cpu_place;
+  for (size_t j = 0; j < gpu_list_.size(); ++j) {
+    auto out_var = local_scope_[j]->Var("out");
+    auto out_tensor = out_var->Get<f::LoDTensor>();
+    PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
+
+    f::Tensor result_tensor;
+    f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
+    float* ct = result_tensor.mutable_data<float>(cpu_place);
+
+    for (int64_t i = 0; i < f::product(kDims); ++i) {
+      ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+    }
+  }
+
+  BroadCastDestroy();
+}
+
+TEST_F(BroadCastTester, BroadCastTestSelectedRows) {
+  int gpu_id = 0;
+  BroadCastInitOp<f::SelectedRows>(gpu_id);
+
+  auto in_var = local_scope_[gpu_id]->Var("input");
+  auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+  auto value = in_selected_rows->mutable_value();
+  value->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+  int height = kDims[0] * 2;
+  std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                            2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
+  in_selected_rows->set_height(height);
+  in_selected_rows->set_rows(rows);
+
+  std::vector<float> send_vector(f::product(kDims));
+  for (size_t k = 0; k < send_vector.size(); ++k) {
+    send_vector[k] = k;
+  }
+  paddle::framework::TensorFromVector(
+      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value);
+
+  bc_op_handle_->Run(false);
+
+  ctxs_->WaitAll();
+
+  p::CPUPlace cpu_place;
+  for (size_t j = 0; j < gpu_list_.size(); ++j) {
+    auto out_var = local_scope_[j]->Var("out");
+    auto& out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
+    float* ct = result_tensor.data<float>();
+
+    for (int64_t i = 0; i < f::product(kDims); ++i) {
+      ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+    }
+  }
+
+  BroadCastDestroy();
+}
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 6b796d92d0..fceb5845ff 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -2,17 +2,20 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
 http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
+
+Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
 #include
+#include
 #include
+#include
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
@@ -137,6 +140,45 @@ template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
   using TYPE = CUDAPinnedDeviceContext;
 };
+
+// Owns one CUDADeviceContext per place and provides lookup by place or
+// device id.
+class ContextMap {
+ public:
+  explicit ContextMap(const std::vector<Place>& places) {
+    order_.reserve(places.size());
+    for (auto& p : places) {
+      auto dev = boost::get<CUDAPlace>(p);
+      int dev_id = dev.device;
+      order_.emplace_back(dev_id);
+      contexts_[dev_id].reset(new CUDADeviceContext(dev));
+    }
+    PADDLE_ENFORCE_EQ(
+        order_.size(), contexts_.size(),
+        "ContextMap does not support two or more contexts on the same device");
+  }
+
+  DeviceContext* DevCtx(int dev_id) const { return at(dev_id); }
+
+  DeviceContext* DevCtx(platform::Place p) const {
+    return DevCtx(boost::get<CUDAPlace>(p).device);
+  }
+
+  DeviceContext* at(platform::Place p) const {
+    return this->at(boost::get<CUDAPlace>(p).device);
+  }
+
+  DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); }
+
+  void WaitAll() {
+    for (auto& p : contexts_) {
+      p.second->Wait();
+    }
+  }
+
+ private:
+  std::unordered_map<int, std::unique_ptr<DeviceContext>> contexts_;
+  std::vector<int> order_;
+};
+
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
-- 
GitLab
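
For reference, a minimal sketch of how the pieces added in this patch fit together, assembled the same way broad_cast_op_handle_test.cc does it. The two-GPU place list, the scope/variable names, and the wrapper function are illustrative assumptions, not part of the patch, and error handling and cleanup are omitted:

  #include "paddle/fluid/framework/details/broad_cast_op_handle.h"
  #include "paddle/fluid/platform/device_context.h"

  namespace f = paddle::framework;
  namespace p = paddle::platform;

  // Broadcast the LoDTensor "input", living in the first local scope, into
  // the variable "out" of every local scope (one scope per CUDA place).
  void BroadCastExample() {
    std::vector<p::Place> places = {p::CUDAPlace(0), p::CUDAPlace(1)};
    p::ContextMap ctxs(places);  // one CUDADeviceContext per place

    f::Scope g_scope;
    std::vector<f::Scope*> scopes;
    for (size_t i = 0; i < places.size(); ++i) {
      scopes.push_back(&g_scope.NewScope());
      scopes[i]->Var("out")->GetMutable<f::LoDTensor>();
    }
    scopes[0]->Var("input")->GetMutable<f::LoDTensor>();  // source variable

    auto* op = new f::details::BCastOpHandle(scopes, places, ctxs);

    // One input handle on the source place, one output handle per place.
    auto* in = new f::details::VarHandle();
    in->place_ = places[0];
    in->name_ = "input";
    in->generated_op_ = nullptr;
    op->AddInput(in);
    for (auto& place : places) {
      auto* out = new f::details::VarHandle();
      out->place_ = place;
      out->name_ = "out";
      out->generated_op_ = op;
      op->AddOutput(out);
    }

    op->Run(false);   // copies "input" into every scope's "out"
    ctxs.WaitAll();   // block until the per-device copies finish
  }

As in the test, the op handle waits on the device context of the input's place, resolves each output place against places_, and issues a TensorCopy per destination scope.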