diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 6a6b497fa897e3882995688bf36704b1d77ea962..49e65e4a544098022d5efeb687504b0bfbed3fd8 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -95,6 +95,11 @@ std::unique_ptr BuildStrategy::Apply( for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (pass->Type() == "multi_devices_pass") { + pass->Erase("enable_sequence_execution"); + if (enable_sequence_execution_) { + pass->Set("enable_sequence_execution", new bool(true)); + } + pass->Erase("places"); pass->SetNotOwned>("places", &places); pass->Erase("loss_var_name"); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 02c4bea16916d58a6d0fce8918f8fceb9ff9356e..cc203a6412a9efffcb862eb0242770608f333768 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -69,6 +69,8 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool enable_sequence_execution_{false}; + // User normally doesn't need to call this API. // The PassBuilder allows for more customized insert, remove of passes // from python side. diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index b6282debdb4eb6b1f29c39e54ac4f3e2296838da..95f114056d5a45c9aa6aaaab0065f87576856080 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,12 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, size_t place_id) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + place_id_(place_id) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index e98f1ab148db083ac63a1afd43e334fbfae62539..0cf112bc4b84f921be243698dcbbce826faf9282 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t place_id); std::string Name() const override; @@ -36,6 +37,10 @@ struct ComputationOpHandle : public OpHandleBase { const platform::Place &GetPlace() const { return place_; } + const OperatorBase &GetOp() const { return *op_; } + + size_t GetPlaceId() const { return place_id_; } + protected: void RunImpl() override; @@ -45,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t place_id_; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 134fcee826715672a6e021e9bf694bb771ebb830..4047bbcf8b7f4ba25365755b8bf9951e985e4b52 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include #include +#include #include #include #include @@ -237,8 +238,24 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( // some optimizer ops might not depend on any nodes), we manually move all // optimizer nodes after last backward nodes. // However, the assumption by SSAGraphBuilder should be relaxed in the future. -std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); +std::vector SortOpsAndDelayOptimizeOp( + const ir::Graph &graph, bool enable_sequence_execution = false) { + std::vector ret; + if (enable_sequence_execution) { + VLOG(10) << "sequential execution mode is enabled"; + for (auto *node : graph.Nodes()) { + if (node->IsOp()) { + ret.push_back(node); + } + } + std::sort(ret.begin(), ret.end(), + [](const ir::Node *n1, const ir::Node *n2) { + return n1->id() < n2->id(); + }); + } else { + ret = ir::TopologySortOperations(graph); + } + size_t last_backward = 0; for (size_t i = 0; i < ret.size(); ++i) { if (boost::get( @@ -287,7 +304,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + bool enable_sequence_execution = Has("enable_sequence_execution") && + Get("enable_sequence_execution"); + std::vector sorted_ops = + SortOpsAndDelayOptimizeOp(*graph, enable_sequence_execution); auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; @@ -443,6 +463,12 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } } + + // Insert dependencies between computation_ops + if (enable_sequence_execution) { + InsertSequenceDependenciesBetweenComputationOps(graph.get()); + } + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. @@ -457,6 +483,34 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } +void MultiDevSSAGraphBuilder::InsertSequenceDependenciesBetweenComputationOps( + ir::Graph *graph) const { + auto &ops = graph->Get(kGraphOps); + // Use std::map instead of std::unordered_map for better log message + std::map> compute_ops; + for (auto &op : ops) { + auto *compute_op = dynamic_cast(op.get()); + if (compute_op == nullptr) continue; + compute_ops[compute_op->GetPlaceId()].push_back(compute_op); + } + + for (auto &pair : compute_ops) { + auto &ops = pair.second; + for (size_t i = 1; i < ops.size(); ++i) { + if (ops[i - 1]->Outputs().empty()) { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + ops[i - 1]->AddOutput(dep_var); + } + ops[i]->AddInput(ops[i - 1]->Outputs().front()); + VLOG(10) << "sequential execution mode: device(" << pair.first + << ") insert dependency between " + << ops[i - 1]->GetOp().DebugString() << " -> " + << ops[i]->GetOp().DebugString(); + } + } +} + bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { PADDLE_ENFORCE(all_vars_.count(og) != 0); if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { @@ -513,7 +567,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -630,8 +684,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index cdf9f13cde608b546d17a1e53e0f6acea9e12566..6476a45d553a0f9617d42d50882ecae6a857e40e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -86,6 +86,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + void InsertSequenceDependenciesBetweenComputationOps(ir::Graph *graph) const; + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 295af1c5837d70c32b522cc47c8c3e12d5bd61c7..1abd9514b2c60474bf0d47869e4ab5a3d139ba79 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -694,6 +694,13 @@ All parameter, weight, gradient are variables in Paddle. "enable_data_balance", [](const BuildStrategy &self) { return self.enable_data_balance_; }, [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }) + .def_property("enable_sequence_execution", + [](const BuildStrategy &self) { + return self.enable_sequence_execution_; + }, + [](BuildStrategy &self, bool b) { + self.enable_sequence_execution_ = b; + }) .def_property("fuse_elewise_add_act_ops", [](const BuildStrategy &self) { return self.fuse_elewise_add_act_ops_;