diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d..4fb4ec38ee965a2790d11378a1ce6befa0ef5a00 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -25,11 +25,12 @@ else() cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() +cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..786d95acb1815e856f684ee6c64664a3297160b2 --- /dev/null +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/data_balance_op_handle.h" +#include +#include "paddle/fluid/framework/details/container_cast.h" + +namespace paddle { +namespace framework { +namespace details { + +DataBalanceOpHandle::DataBalanceOpHandle( + const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} + +std::string DataBalanceOpHandle::Name() const { return "data balance"; } + +std::vector> DataBalanceOpHandle::GetBalancePlan( + const std::vector &device_sizes) { + int device_num = device_sizes.size(); + int total_size = 0; + int empty_num = 0; + std::vector> size_device_vec; + size_device_vec.reserve(device_num); + for (int i = 0; i < device_num; ++i) { + if (device_sizes[i] == 0) { + ++empty_num; + } + total_size += device_sizes[i]; + size_device_vec.push_back({{device_sizes[i], i}}); + } + std::vector> res; + if (empty_num == 0) { + // No need to do data balance. + return res; + } + if (total_size < device_num) { + // No enough data. + PADDLE_THROW("There is no next data."); + } + std::sort(size_device_vec.begin(), size_device_vec.end(), + [](const std::array &a, const std::array &b) { + return a[0] > b[0]; + }); + int expected_device_size = total_size / device_num; + int src_idx = 0; + for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) { + if (size_device_vec[src_idx][0] <= expected_device_size) { + ++src_idx; + PADDLE_ENFORCE_LT(src_idx, device_num - empty_num); + } + size_device_vec[src_idx][0] -= expected_device_size; + size_device_vec[dst_idx][0] += expected_device_size; + res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1], + expected_device_size}}); + } + return res; +} + +void DataBalanceOpHandle::RunImpl() { + if (places_.size() == 1) { + return; + } + auto in_var_handles = DynamicCast(inputs_); + auto out_var_handles = DynamicCast(outputs_); + PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + int data_num = in_var_handles.size() / places_.size(); + WaitInputVarGenerated(); + + std::vector> lod_tensors; + std::vector device_sizes; + for (int i = 0; i < static_cast(in_var_handles.size()); ++i) { + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + int place_idx = i / data_num; + int data_idx = i % data_num; + auto *local_scope = + local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get(); + auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_); + PADDLE_ENFORCE(tensor_var->IsType()); + auto *tensor = tensor_var->GetMutable(); + PADDLE_ENFORCE(places_[place_idx] == tensor->place()); + lod_tensors[data_idx].push_back(tensor); + int ins_size = + tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements(); + if (data_idx == 0) { + device_sizes.emplace_back(ins_size); + } else { + PADDLE_ENFORCE_EQ(ins_size, device_sizes.at(place_idx)); + } + } + const auto &balance_plan = GetBalancePlan(device_sizes); + + for (const auto &trans : balance_plan) { + for (int data_idx = 0; data_idx < data_num; ++data_idx) { + LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]]; + LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]]; + int trans_ins_size = trans[2]; + LoD src_lod = src_tensor->lod(); + int src_ins_size = + src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements(); + int cut_point = src_ins_size - trans_ins_size; + if (!src_lod.empty()) { + for (auto &level : src_lod) { + cut_point = level[cut_point]; + } + } + TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]), + dst_tensor->place(), dst_tensor); + src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point)); + if (!src_lod.empty()) { + dst_tensor->set_lod(SliceInLevel( + src_lod, 0, src_ins_size - trans_ins_size, src_ins_size)); + src_tensor->set_lod( + SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size)); + } + } + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..00bc4837d66f40e06b332dff92429d85f3b2acfc --- /dev/null +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace details { + +struct DataBalanceOpHandle : public OpHandleBase { + public: + DataBalanceOpHandle(const std::vector &local_scopes, + const std::vector &places); + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + private: + // std::vector<(src_dev_id, dst_dev_id, trans_size)> + std::vector> GetBalancePlan( + const std::vector &batch_size_per_device); + + const std::vector &local_scopes_; + const std::vector &places_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index e7063fb0423cbe8ad79e46ee8bcb0dcca79b1067..357f6ff5dc294a7bc6edd2d61bdd61ee580ac2d2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/data_balance_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" @@ -217,6 +218,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( // gradients. CreateComputationalOps(&result, *op, places_.size()); + if (op->Type() == "read") { + const auto &data_var_names = op->Output("Out"); + InsertDataBalanceOp(&result, data_var_names); + } + if (!is_forwarding && places_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. @@ -360,6 +366,24 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, } } +void MultiDevSSAGraphBuilder::InsertDataBalanceOp( + SSAGraph *result, const std::vector &datas) const { + result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_)); + auto *op_handle = result->ops_.back().get(); + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + for (const std::string &d_name : datas) { + auto &vars = result->vars_[i][d_name]; + PADDLE_ENFORCE(!vars.empty()); + op_handle->AddInput(vars.back().get()); + auto var = new VarHandle(vars.size(), i, d_name, p); + vars.emplace_back(var); + op_handle->AddOutput(var); + } + } +} + bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( const std::string &og, std::unordered_set *og_has_been_broadcast) const { @@ -509,7 +533,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); // the variable name which contains .block means it was splited by // split_byref op - // so that we can balance the variable blocks to all the pserver instances. + // so that we can balance the variable blocks to all the pserver + // instances. if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && op.InputArgumentNames()[0].find(".block") == std::string::npos) { op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 0b6347bf51dc1c347073a0fdcf4ddd91865d846d..a964e024885e56693224a6199e00ff30beaa1df4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -101,6 +101,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertDataBalanceOp(SSAGraph *result, + const std::vector &datas) const; + void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index d29d8ce1c561e45980d10c17c984ca2ed3b453f3..49672e11818266b6f0328206a9985ca6c9bd4287 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (t.type().hash_code() == typeid(float).hash_code()) { + if (t.type().hash_code() == typeid(float).hash_code()) { // NOLINT os << t.data()[i] << " "; } else if (t.type().hash_code() == typeid(int64_t).hash_code()) { os << t.data()[i] << " "; @@ -89,6 +89,7 @@ std::string LoDToString(const LoD &lod) { LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_begin, elem_end); PADDLE_ENFORCE_LT(elem_end, in[level].size()); LoD res; diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 72a27d43584d55cd0859c63577ae85ff0f5fdfa8..8e9f91c185dda4729230bc9a89eb6441cbc6205b 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -66,9 +66,15 @@ class ReadOp : public framework::OperatorBase { std::vector out_arg_names = Outputs("Out"); std::vector ins; reader->ReadNext(&ins); - PADDLE_ENFORCE(!ins.empty(), "There is no next data."); + if (ins.empty()) { + ins.resize(out_arg_names.size()); + for (auto& tensor : ins) { + // data type is not important for subsequent DataBalanceOpHandle + tensor.mutable_data(framework::make_ddim({0}), dev_place); + } + } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); - for (size_t i = 0; i < ins.size(); ++i) { + for (size_t i = 0; i < out_arg_names.size(); ++i) { auto* out = scope.FindVar(out_arg_names[i])->GetMutable(); out->ShareDataWith(ins[i]);