diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 2a87f02bd566f6b6d8326827a9cd5f0ae648de6c..3644ed9cb7f7106f7c27be43cfedb27d681b9b7d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,22 +5,22 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod if(WITH_GPU) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) - nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) -endif() -cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) -cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) - -if(WITH_GPU) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) else() set(multi_devices_graph_builder_deps) endif() +cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) +cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory device_context broadcast_op_handle) +cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory + device_context gather_op_handle) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 2c99a347bf520da66c19db1812cdfbd2ca23babf..7cd13a50f558ad03b6811ddf55489509558ccd1d 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { namespace details { -Tensor *GetTensorFromVar(Variable *in_var) { +static Tensor *GetTensorFromVar(Variable *in_var) { if (in_var->IsType()) { return in_var->GetMutable(); } else if (in_var->IsType()) { @@ -52,29 +52,34 @@ void BroadcastOpHandle::RunImpl() { auto &out_p = out_handle->place_; auto out_scope_idx = out_handle->scope_idx_; - PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), ""); + PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), + "%s is not the the local_scopes ", out_handle->name_); auto *s = local_scopes_[out_scope_idx]; auto out_var = s->FindVar(out_handle->name_); - PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), ""); + PADDLE_ENFORCE_EQ( + out_var->Type(), in_var->Type(), + "The type of input and output is not equal. (%s_%d vs %s_%d)", + out_handle->name_, out_handle->scope_idx_, in_var_handle->name_, + in_var_handle->scope_idx_); if (in_var->IsType()) { - auto in_sr = in_var->GetMutable(); - auto out = out_var->GetMutable(); - if (in_sr == out) continue; - out->set_height(in_sr->height()); - out->set_rows(in_sr->rows()); - out->mutable_value()->Resize(in_sr->value().dims()); - out->mutable_value()->mutable_data(out_p, in_sr->value().type()); + auto &in_sr = in_var->Get(); + auto out_sr = out_var->GetMutable(); + if (&in_sr == out_sr) continue; + out_sr->set_height(in_sr.height()); + out_sr->set_rows(in_sr.rows()); + out_sr->mutable_value()->Resize(in_sr.value().dims()); + out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type()); } else if (in_var->IsType()) { - auto in_lod = in_var->GetMutable(); - auto out = out_var->GetMutable(); - if (in_lod == out) continue; - out->set_lod(in_lod->lod()); - out->Resize(in_lod->dims()); - out->mutable_data(out_p, in_lod->type()); + auto in_lod = in_var->Get(); + auto out_lod = out_var->GetMutable(); + if (&in_lod == out_lod) continue; + out_lod->set_lod(in_lod.lod()); + out_lod->Resize(in_lod.dims()); + out_lod->mutable_data(out_p, in_lod.type()); } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + PADDLE_THROW("Var should be LoDTensor or SelectedRows."); } Tensor *out_tensor = GetTensorFromVar(out_var); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 06ec164ce083d0a7b4607686dcdb3dbeb5755fec..74c0a6a09804d2614954f277c4669c4502ee5c02 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -35,7 +35,6 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; - // const platform::ContextMap &ctxs_; BroadcastOpHandle(const std::vector &local_scopes, const std::vector &places); diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index d03115f0be6901c8b409efe229daa87b9cbee9a1..29cf120c76dc270c162ebff5f21f4c149ffa3b51 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -84,7 +84,7 @@ class BroadcastTester : public ::testing::Test { bc_op_handle_->AddOutput(out_var_handle); } } - void BroadcastDestroy() { + void BroadcastOpDestroy() { for (auto in : bc_op_handle_->inputs_) { delete in; } @@ -139,7 +139,7 @@ class BroadcastTester : public ::testing::Test { } } - BroadcastDestroy(); + BroadcastOpDestroy(); } void TestBroadcastSelectedRows() { @@ -188,7 +188,7 @@ class BroadcastTester : public ::testing::Test { } } - BroadcastDestroy(); + BroadcastOpDestroy(); } public: diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..940786837223eb842f776488577dd9b87d1ad0da --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/gather_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +static Tensor *GetTensorFromVar(Variable *in_var) { + if (in_var->IsType()) { + return in_var->GetMutable(); + } else if (in_var->IsType()) { + return in_var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + return nullptr; +} +GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} + +void GatherOpHandle::RunImpl() { + PADDLE_ENFORCE_EQ(this->inputs_.size(), places_.size()); + PADDLE_ENFORCE_EQ(this->outputs_.size(), 1); + + // Wait input done, this Wait is asynchronous operation + for (auto *in : inputs_) { + if (inputs_[0]->generated_op_) { + auto &p = static_cast(in)->place_; + in->generated_op_->Wait(dev_ctxes_[p]); + } + } + auto in_0_handle = static_cast(inputs_[0]); + auto pre_in_var = + local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + + std::vector out_rows; + std::vector in_tensors; + std::vector in_places; + + // gather the inputs + for (auto *in : inputs_) { + auto in_handle = static_cast(in); + auto in_p = in_handle->place_; + in_places.push_back(in_p); + PADDLE_ENFORCE_LT(in_handle->scope_idx_, local_scopes_.size(), + "%s is not the the local_scopes ", in_handle->name_); + + auto *s = local_scopes_[in_handle->scope_idx_]; + auto in_var = s->FindVar(in_handle->name_); + PADDLE_ENFORCE_EQ(in_var->Type(), pre_in_var->Type(), + "The type of input is not consistent."); + + if (in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + auto &in_sr = in_var->Get(); + auto in_sr_rows = in_sr.rows(); + out_rows.insert(out_rows.begin(), in_sr_rows.begin(), in_sr_rows.end()); + PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), ""); + PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ""); + } else if (in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + auto &in_lodtensor = in_var->Get(); + PADDLE_ENFORCE_EQ(in_lodtensor.lod(), pre_in.lod()); + PADDLE_ENFORCE_EQ(in_lodtensor.dims(), pre_in.dims()); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows."); + } + in_tensors.push_back(GetTensorFromVar(in_var)); + pre_in_var = in_var; + } + + // write the output + auto out_handle = static_cast(this->outputs_[0]); + auto &out_place = out_handle->place_; + auto out_scope_idx = out_handle->scope_idx_; + auto out_var = local_scopes_[out_scope_idx]->FindVar(out_handle->name_); + + if (pre_in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + auto out = out_var->GetMutable(); + out->set_height(pre_in.height()); + out->set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in.GetCompleteDims(); + out_dim[0] = static_cast(rows); + out->mutable_value()->Resize(out_dim); + out->mutable_value()->mutable_data(out_place, pre_in.value().type()); + auto out_tensor = out->mutable_value(); + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j]->dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(*(in_tensors[j]), out_place, + *(dev_ctxes_[in_places[j]]), &sub_out); + s = e; + } + } else if (pre_in_var->IsType()) { + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows."); + } +} + +std::string GatherOpHandle::Name() const { return "broadcast"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..48e1db227bd8c18524b91717b0490c79a2a7156d --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +/* + * Broadcast the input to all scope. + * + */ +struct GatherOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + + GatherOpHandle(const std::vector &local_scopes, + const std::vector &places); + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a029a2d2666256eff35656908b878c075ae6b66c --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -0,0 +1,227 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/gather_op_handle.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +class GatherTester : public ::testing::Test { + public: + void InitCtx(bool use_gpu) { + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Gather, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } + } + } + + template + void InitGatherOp(int input_scope_idx) { + for (size_t j = 0; j < gpu_list_.size(); ++j) { + local_scope_.push_back(&g_scope_.NewScope()); + auto* out_var = local_scope_[j]->Var("input"); + out_var->GetMutable(); + } + auto* in_var = local_scope_[input_scope_idx]->Var("out"); + in_var->GetMutable(); + + gather_op_handle_ = new f::details::GatherOpHandle(local_scope_, gpu_list_); + + f::details::VarHandle* out_var_handle = new f::details::VarHandle(); + out_var_handle->place_ = gpu_list_[input_scope_idx]; + out_var_handle->name_ = "out"; + out_var_handle->version_ = 2; + out_var_handle->scope_idx_ = input_scope_idx; + out_var_handle->generated_op_ = gather_op_handle_; + gather_op_handle_->AddOutput(out_var_handle); + + for (size_t j = 0; j < gpu_list_.size(); ++j) { + gather_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j]; + f::details::VarHandle* in_var_handle = new f::details::VarHandle(); + in_var_handle->place_ = gpu_list_[j]; + in_var_handle->name_ = "input"; + in_var_handle->version_ = 1; + in_var_handle->scope_idx_ = j; + in_var_handle->generated_op_ = nullptr; + gather_op_handle_->AddInput(in_var_handle); + } + } + void GatherOpDestroy() { + for (auto in : gather_op_handle_->inputs_) { + delete in; + } + for (auto out : gather_op_handle_->outputs_) { + delete out; + } + delete gather_op_handle_; + for (size_t j = 0; j < ctxs_.size(); ++j) { + delete ctxs_[j]; + } + } + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + + void TestGatherLodTensor() { + // int input_scope_idx = 0; + // InitGatherOp(input_scope_idx); + // + // auto in_var = local_scope_[input_scope_idx]->Var("input"); + // auto in_lod_tensor = in_var->GetMutable(); + // in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); + // + // std::vector send_vector(f::product(kDims), input_scope_idx + + // 12); + // for (size_t k = 0; k < send_vector.size(); ++k) { + // send_vector[k] = k; + // } + // f::LoD lod{{0, 10, 20}}; + // paddle::framework::TensorFromVector( + // send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); + // in_lod_tensor->set_lod(lod); + // + // gather_op_handle_->Run(false); + // + // WaitAll(); + // + // p::CPUPlace cpu_place; + // for (size_t j = 0; j < gpu_list_.size(); ++j) { + // auto out_var = local_scope_[j]->Var("out"); + // auto out_tensor = out_var->Get(); + // PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); + // + // f::Tensor result_tensor; + // f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); + // float* ct = result_tensor.mutable_data(cpu_place); + // + // for (int64_t j = 0; j < f::product(kDims); ++j) { + // ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + // } + // } + // + // GatherOpDestroy(); + } + + void TestGatherSelectedRows() { + int output_scope_idx = 0; + InitGatherOp(output_scope_idx); + + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = local_scope_[input_scope_idx]->Var("input"); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, gpu_list_[input_scope_idx]); + + in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + value->Resize(kDims); + } + + gather_op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto out_var = local_scope_[output_scope_idx]->Var("out"); + auto& out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + } + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); + } + + GatherOpDestroy(); + } + + public: + f::Scope g_scope_; + std::vector ctxs_; + std::vector local_scope_; + std::vector gpu_list_; + f::details::GatherOpHandle* gather_op_handle_; +}; + +// TEST_F(GatherTester, TestCPUGatherTestLodTensor) { +// InitCtx(false); +// TestGatherLodTensor(); +//} + +TEST_F(GatherTester, TestCPUGatherTestSelectedRows) { + InitCtx(false); + TestGatherSelectedRows(); +} + +#ifdef PADDLE_WITH_CUDA +// TEST_F(GatherTester, TestGPUGatherTestLodTensor) { +// InitCtx(true); +// TestGatherLodTensor(); +//} + +TEST_F(GatherTester, TestGPUGatherTestSelectedRows) { + InitCtx(true); + TestGatherSelectedRows(); +} +#endif