未验证 提交 486121d5 编写于 作者: Y Yancey 提交者: GitHub

Merge pull request #12120 from Yancey1989/fix_cpu_pe

fix bcast with parallel executor + cpu
...@@ -95,7 +95,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -95,7 +95,7 @@ ParallelExecutor::ParallelExecutor(
} }
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToGPUs(bcast_vars); BCastParamsToDevs(bcast_vars);
} }
// Startup Program has been run. All local scopes have the correct parameters. // Startup Program has been run. All local scopes have the correct parameters.
...@@ -131,7 +131,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -131,7 +131,7 @@ ParallelExecutor::ParallelExecutor(
member_->places_, std::move(member_->executor_))); member_->places_, std::move(member_->executor_)));
} }
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToDevs(
const std::unordered_set<std::string> &vars) const { const std::unordered_set<std::string> &vars) const {
// For the initializing bcast, all vars would be bcast from device(0), // For the initializing bcast, all vars would be bcast from device(0),
// otherwise // otherwise
...@@ -202,7 +202,11 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -202,7 +202,11 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif #endif
} else { } else {
platform::CPUPlace cpu; platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id))
continue;
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
......
...@@ -66,7 +66,7 @@ class ParallelExecutor { ...@@ -66,7 +66,7 @@ class ParallelExecutor {
void Run(const std::vector<std::string> &fetch_tensors, void Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name); const std::string &fetched_var_name);
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const; void BCastParamsToDevs(const std::unordered_set<std::string> &vars) const;
private: private:
ParallelExecutorPrivate *member_; ParallelExecutorPrivate *member_;
......
...@@ -665,7 +665,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -665,7 +665,7 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>()) size_t>())
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) .def("bcast_params", &ParallelExecutor::BCastParamsToDevs)
// NOTE: even if we return a vec<Scope*>* to Python using the reference policy, // NOTE: even if we return a vec<Scope*>* to Python using the reference policy,
// we still cannot get local_scope from this vector, since the element // we still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope* // of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册