diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 1bcd8412eb2d618b923bcd0557d118af62271f4a..180a2adf91d06390d7f00d06591b81a99c03955c 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -12,7 +12,7 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
 
 if(WITH_GPU)
   nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-          dynload_cuda)
+          dynload_cuda variable_visitor)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
   nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
   nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
@@ -24,6 +24,7 @@ else()
 endif()
 
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32415c192f0be51bf0850fe533c212c635779a30
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FuseVarsOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+
+  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+
+  auto out_var_handle = out_var_handles[0];
+  auto out_var = scope->Var(out_var_handle->name_);
+
+  auto out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
+
+  int64_t s = 0;
+  for (size_t i = 1; i < out_var_handles.size(); ++i) {
+    auto out_name = out_var_handles[i]->name_;
+    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
+    auto numel = this->inputs_numel_.at(out_name);
+    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
+    s += numel;
+  }
+  this->RunAndRecordEvent([this] {});
+}
+
+std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <typeindex>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FuseVarsOpHandle : public OpHandleBase {
+ public:
+  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+                   const std::unordered_map<std::string, int64_t> &inputs_numel,
+                   const std::type_index &var_type)
+      : local_scope_(local_scope),
+        place_(place),
+        inputs_numel_(inputs_numel),
+        type_(var_type) {
+    total_numel_ = 0;
+    for (auto in_numel : inputs_numel) {
+      PADDLE_ENFORCE_GT(in_numel.second, 0);
+      total_numel_ += in_numel.second;
+    }
+  }
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  Scope *local_scope_;
+  const platform::Place place_;
+  const std::unordered_map<std::string, int64_t> inputs_numel_;
+  const std::type_index type_;
+  int64_t total_numel_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 95aa599cd3e403e9cc66b2b5ad35d0d214d1ab5b..5bba089ade801a06e0364835efe5249105dcfcac 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -11,10 +11,12 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <algorithm>
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-#include <algorithm>
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 
 namespace paddle {
 namespace framework {
@@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
 }
 
 void NCCLAllReduceOpHandle::RunImpl() {
-  if (inputs_.size() == 1) {
+  if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
     WaitInputVarGenerated();
-
-    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-    int dtype = -1;
-    size_t numel = 0;
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");
 
     std::vector<const LoDTensor *> lod_tensors;
-
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto *s = local_scopes_[i];
       auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+      auto &lod_tensor =
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
       lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
     }
 
     if (platform::is_gpu_place(lod_tensors[0]->place())) {
+      int dtype = -1;
+      size_t numel = 0;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
         auto &p = places_[i];
@@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
         auto &scope =
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope.FindVar(var_name);
+        auto *var = scope.FindVar(in_var_handles[i]->name_);
         auto *dev_ctx = dev_ctxes_[p];
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index a0c321843e3fc5abcbd1ef2ce2e153250269aa7d..0526f9e9d4778e626d456f375bb74990af351914 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -41,8 +41,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
   void RunImpl() override;
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  const std::vector<Scope *> local_scopes_;
+  const std::vector<platform::Place> places_;
   const platform::NCCLContextMap &nccl_ctxs_;
 };
 
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 6b064650b4f09737836bda4a43fa421720077929..3849cca59a3347137b769f97261cfbf97da8d6ff 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
   }
 }
 
+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
+
 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
   return in_var && in_var->generated_op_;
 }
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 8f94206a87dbae8a81727ca48718886bbabbe25c..dc92b0fe9f760d95d4869fdd56c0400b6710437f 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -80,6 +80,8 @@ class OpHandleBase {
 
   const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 
+  size_t NoDummyInputSize() const;
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
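
Taken together, the new FuseVarsOpHandle allocates one contiguous tensor of total_numel_ elements and then points each fused variable at a disjoint slice of it via ShareDataWith, so the per-variable offsets are just a running sum of the element counts. The standalone sketch below illustrates that layout arithmetic; it is plain C++ written for this note, not Paddle code, and the function name FuseLayout and the example gradient names and sizes are invented for illustration.

// Standalone sketch of the slicing arithmetic in FuseVarsOpHandle::RunImpl.
// Offsets here are assigned in map iteration order; in RunImpl the order
// comes from the output var handles, with sizes looked up by name.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Maps each variable name to the half-open range [offset, offset + numel)
// it would occupy inside one fused buffer.
std::map<std::string, std::pair<int64_t, int64_t>> FuseLayout(
    const std::map<std::string, int64_t> &inputs_numel) {
  std::map<std::string, std::pair<int64_t, int64_t>> layout;
  int64_t offset = 0;
  for (const auto &kv : inputs_numel) {
    // Same arithmetic as out_tensor->Slice(s, s + numel); s += numel;
    layout[kv.first] = std::make_pair(offset, offset + kv.second);
    offset += kv.second;
  }
  return layout;
}

int main() {
  // Hypothetical per-gradient element counts; the fused buffer would hold
  // 6 + 2 + 4 = 12 elements, matching total_numel_ in the op's constructor.
  std::map<std::string, int64_t> numel = {
      {"w0@GRAD", 6}, {"w1@GRAD", 2}, {"w2@GRAD", 4}};
  for (const auto &kv : FuseLayout(numel)) {
    std::cout << kv.first << " -> [" << kv.second.first << ", "
              << kv.second.second << ")\n";
  }
  return 0;
}

Note that ShareDataWith(out_tensor->Slice(s, s + numel)) only aliases memory rather than copying, so once the variables are fused, later passes can treat many small variables as a single contiguous buffer; the diff itself does not state the intended consumer, but that is the property the op provides.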