Unverified · Commit 30568473 · Authored by Yan Xu · Committed by GitHub

fix broadcast on mp mode (#15951)

* fix broadcast with mp mode

* polish code test=develop

* fix bcast strategy test=develop

* fix cpplint test=develop

* fix py3 failed test=develop

* fix comment test=develop

* update comment test=develop
Parent e3c37bd5
paddle/fluid/framework/parallel_executor.cc

@@ -181,13 +181,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
-ParallelExecutor::ParallelExecutor(
-    const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &bcast_vars,
-    const std::string &loss_var_name, Scope *scope,
-    const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    ir::Graph *graph)
+ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
+                                   const std::vector<std::string> &bcast_vars,
+                                   const std::string &loss_var_name,
+                                   Scope *scope,
+                                   const std::vector<Scope *> &local_scopes,
+                                   const ExecutionStrategy &exec_strategy,
+                                   const BuildStrategy &build_strategy,
+                                   ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   }
 
-  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevices(bcast_vars);
+  // broadcast parameters from the 0th device to others:
+  auto need_broadcast = [&]() -> bool {
+    if (build_strategy.num_trainers_ > 1) {
+      // 1. num_trainers would be greater than 1 for nccl distributed training.
+      return true;
+    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+      // 2. Only one trainer process, but ParallelExecutor holds multiple
+      // devices.
+      return true;
+    }
+    return false;
+  };
+
+  if (need_broadcast()) {
+    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }
   // Startup Program has been run. All local scopes has correct parameters.
 
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
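The new need_broadcast lambda replaces the old single condition: parameters are now broadcast either when an NCCL distributed job runs with more than one trainer, or when a single trainer drives several devices and no pre-initialized local scopes were passed in. The standalone C++ sketch below restates that decision rule outside the framework; the function and parameter names are illustrative stand-ins for build_strategy.num_trainers_ and the local-scope arguments, not PaddlePaddle APIs.

```cpp
#include <cstddef>
#include <iostream>

// Mirrors the decision made by the need_broadcast lambda above.
// num_trainers stands in for build_strategy.num_trainers_;
// user_passed_scopes stands in for !local_scopes.empty().
bool NeedBroadcast(int num_trainers, std::size_t num_local_scopes,
                   bool user_passed_scopes) {
  if (num_trainers > 1) {
    // Multi-trainer NCCL job: trainer 0 must broadcast to the others.
    return true;
  }
  if (num_local_scopes != 1 && !user_passed_scopes) {
    // Single trainer, multiple devices, scopes created internally:
    // device 0 broadcasts to the remaining devices.
    return true;
  }
  return false;
}

int main() {
  std::cout << NeedBroadcast(2, 1, false) << "\n";  // 1: distributed job
  std::cout << NeedBroadcast(1, 4, false) << "\n";  // 1: one trainer, four devices
  std::cout << NeedBroadcast(1, 1, false) << "\n";  // 0: nothing to broadcast
  return 0;
}
```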
@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
 }
 
 void ParallelExecutor::BCastParamsToDevices(
-    const std::unordered_set<std::string> &vars) const {
+    const std::vector<std::string> &vars, int trainer_id) const {
   // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
     framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
       auto place = member_->places_[i];
       void *buffer;
 
-      if (i == 0) {
+      if (i == 0 && trainer_id == 0) {
        buffer = const_cast<void *>(main_tensor.data<void>());
       } else {
        auto local_scope = member_->local_scopes_[i];
......
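The second hunk changes which participant supplies the source buffer for the broadcast. Previously device 0 of every process pointed the broadcast at its own parameter tensor; with the added trainer_id check, only device 0 of trainer 0 does so, while every other device, including device 0 on the remaining trainers, takes the receive path and gets a freshly prepared destination buffer. A minimal, framework-free C++ sketch of that selection (the names here are hypothetical, not Paddle code):

```cpp
#include <cstdio>
#include <vector>

// Decide, for each local device, whether it acts as the broadcast root
// (contributing its existing parameter data) or as a receiver that must
// prepare a destination buffer. Hypothetical illustration only.
enum class Role { kRoot, kReceiver };

std::vector<Role> PickBroadcastRoles(int num_devices, int trainer_id) {
  std::vector<Role> roles(num_devices, Role::kReceiver);
  if (trainer_id == 0 && num_devices > 0) {
    roles[0] = Role::kRoot;  // mirrors: if (i == 0 && trainer_id == 0)
  }
  return roles;
}

int main() {
  // Trainer 0 with 2 devices: device 0 is the root, device 1 receives.
  for (Role r : PickBroadcastRoles(2, /*trainer_id=*/0))
    std::printf("%s ", r == Role::kRoot ? "root" : "recv");
  std::printf("\n");
  // Trainer 1 with 2 devices: every device receives from trainer 0's device 0.
  for (Role r : PickBroadcastRoles(2, /*trainer_id=*/1))
    std::printf("%s ", r == Role::kRoot ? "root" : "recv");
  std::printf("\n");
  return 0;
}
```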
paddle/fluid/framework/parallel_executor.h

@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/details/build_strategy.h"
@@ -45,7 +47,7 @@ class ParallelExecutor {
  public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &bcast_vars,
+                            const std::vector<std::string> &bcast_vars,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
@@ -70,7 +72,10 @@ class ParallelExecutor {
                      const std::string &fetched_var_name);
 
  private:
-  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
+  // broadcast the parameters from the 0th device.
+  // trainer_id the trainer index in nccl distributed training.
+  void BCastParamsToDevices(const std::vector<std::string> &vars,
+                            int trainer_id = 0) const;
 
   bool EnableParallelGraphExecution(const ir::Graph &graph,
                                     const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy) const;
......
paddle/fluid/pybind/pybind.cc

@@ -1251,7 +1251,7 @@ All parameter, weight, gradient are variables in Paddle.
         cannot be updated after being finalized.)DOC");
 
   pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::unordered_set<std::string> &, const std::string &,
+                  const std::vector<std::string> &, const std::string &,
                   Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
                   const BuildStrategy &, ir::Graph *>())
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
......
python/paddle/fluid/compiler.py

@@ -230,13 +230,17 @@ class CompiledProgram(object):
                 self._persistable_vars.append(cpt.to_text(node.name()))
 
         places = list(map(_place_obj, self._places))
-        return core.ParallelExecutor(places,
-                                     set(self._persistable_vars),
-                                     cpt.to_text(self._loss_name)
-                                     if self._loss_name else six.u(''), scope,
-                                     self._local_scopes, self._exec_strategy,
-                                     self._build_strategy, self._graph)
+
+        # ParallelExecutor would broadcast all the parameters during initializing.
+        # The parameters of each process should be in the same order for the
+        # data-parallel distributed training to keep the broadcast correct.
+        self._persistable_vars = list(set(self._persistable_vars))
+        self._persistable_vars.sort()
+
+        return core.ParallelExecutor(
+            places, self._persistable_vars,
+            cpt.to_text(self._loss_name)
+            if self._loss_name else six.u(''), self._scope, self._local_scopes,
+            self._exec_strategy, self._build_strategy, self._graph)
 
     def _compile_inference(self):
         return core.create_paddle_predictor(self._infer_config)
......
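The Python-side change is the counterpart of switching bcast_vars from std::unordered_set to std::vector on the C++ side: an unordered container gives no guarantee about iteration order, so two trainer processes could enumerate, and therefore broadcast, the parameters in different orders and pair up the wrong tensors. Deduplicating and then sorting the names before handing them to core.ParallelExecutor makes every process see the same sequence. The sketch below uses only the C++ standard library to show the difference in guarantees; it is an illustration of the ordering argument, not Paddle code.

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  // Parameter names collected from the graph, possibly with duplicates.
  std::vector<std::string> collected = {"fc_0.w_0", "fc_0.b_0",
                                        "conv2d_0.w_0", "fc_0.w_0"};

  // Deduplicate. Iterating the unordered_set directly would yield an
  // unspecified order that can differ between processes and library builds.
  std::unordered_set<std::string> unique(collected.begin(), collected.end());

  // Copy into a vector and sort: every process now enumerates (and would
  // therefore broadcast) the parameters in exactly the same order.
  std::vector<std::string> bcast_vars(unique.begin(), unique.end());
  std::sort(bcast_vars.begin(), bcast_vars.end());

  for (const std::string &name : bcast_vars) std::cout << name << "\n";
  return 0;
}
```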