Unverified commit 695dd371, authored by lilong12, committed by GitHub

Adjust pipeline optimizer for 3d parallelism (#31939)

* update, test=develop
Parent: 6f85e241
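
For orientation, a minimal user-side sketch of turning on the pipeline meta optimizer that this commit adjusts. It assumes the paddle.distributed.fleet DistributedStrategy API of this period; the config values and the commented-out model wiring are placeholders, not part of the change.

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
# fleet.init expects the environment set up by `python -m paddle.distributed.launch ...`
fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
strategy.pipeline = True
# Config keys assumed from the fleet API of this era; values are examples only.
strategy.pipeline_configs = {
    "micro_batch_size": 2,    # size of each micro batch fed into a stage
    "accumulate_steps": 4,    # micro batches accumulated per mini batch
    "schedule_mode": "1F1B",  # pipeline schedule; matches `schedule_mode` in the diff
}

# Hypothetical wiring; the real model/loss definition is up to the user:
# opt = paddle.optimizer.SGD(learning_rate=0.01)
# opt = fleet.distributed_optimizer(opt, strategy)
# opt.minimize(loss)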
@@ -71,37 +71,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id,
                                      const ProgramDesc& program,
                                      const platform::Place& place) {
   auto& global_block = program.Block(0);
-  std::map<std::string, int> param_map;
-  for (auto& var : global_block.AllVars()) {
-    if (var->Persistable()) {
-      param_map[var->Name()] = 1;
-    }
-  }
   for (auto& var : global_block.AllVars()) {
-    bool is_param_grad = false;
-    size_t pos = 0;
-    // A magic suffix to indicate the merged gradient
-    std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED";
-    if ((pos = var->Name().find(magicSuffix)) != std::string::npos) {
-      auto prefix_name = var->Name().substr(0, pos);
-      if (param_map.find(prefix_name) != param_map.end()) {
-        is_param_grad = true;
-      }
-    }
     if (var->Persistable() && microbatch_id == 0) {
       auto* ptr = root_scope_->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create persistable var: " << var->Name()
-              << ", which pointer is " << ptr;
-    } else if (is_param_grad && microbatch_id == 0) {
-      auto* ptr = minibatch_scope_->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create grad for persistable var: " << var->Name()
+      VLOG(5) << "Create persistable var: " << var->Name()
               << ", which pointer is " << ptr;
-    } else if (!var->Persistable() && !is_param_grad) {
+    } else if (!var->Persistable()) {
       auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
-      VLOG(3) << "Create variable " << var->Name() << " for microbatch "
+      VLOG(5) << "Create variable " << var->Name() << " for microbatch "
              << microbatch_id << ", which pointer is " << ptr;
       InitializeVariable(ptr, var->GetType());
     }
...
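
The block removed above identified merged-gradient variables purely by name. A small Python rendering of that check, for illustration only (kGradVarSuffix expands to "@GRAD" in Paddle; the parameter names below are made up):

GRAD_SUFFIX = "@GRAD"                    # kGradVarSuffix
MERGED_SUFFIX = GRAD_SUFFIX + "@MERGED"  # the "magic suffix" in the removed code

def is_merged_param_grad(var_name, persistable_params):
    """True if var_name is the merged gradient of a persistable parameter."""
    pos = var_name.find(MERGED_SUFFIX)
    return pos != -1 and var_name[:pos] in persistable_params

params = {"fc_0.w_0", "fc_0.b_0"}  # hypothetical parameter names
print(is_merged_param_grad("fc_0.w_0@GRAD@MERGED", params))  # True
print(is_merged_param_grad("fc_0.tmp_1", params))            # False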
@@ -106,6 +106,11 @@ class CollectiveHelper(object):
                 'use_calc_stream': True,
                 OP_ROLE_KEY: OpRole.Forward
             })
+        block.append_op(
+            type='c_sync_calc_stream',
+            inputs={'X': sync_var},
+            outputs={'Out': sync_var},
+            attrs={OP_ROLE_KEY: OpRole.Forward})
         block = program.global_block()
         if core.is_compiled_with_cuda():
...
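
The five added lines pair the preceding collective on sync_var (which runs with use_calc_stream=True) with a c_sync_calc_stream op, so that later work on the communication stream waits for the calc stream. A hedged sketch of that generic pattern; this helper is illustrative and not part of CollectiveHelper, so every name is passed in explicitly.

# Illustrative only: run a collective on the calc (compute) stream, then append
# c_sync_calc_stream so later communication waits for the calc-stream result.
def broadcast_then_sync(block, var, ring_id, OP_ROLE_KEY, OpRole):
    block.append_op(
        type='c_broadcast',
        inputs={'X': var},
        outputs={'Out': var},
        attrs={
            'ring_id': ring_id,
            'root': 0,
            'use_calc_stream': True,
            OP_ROLE_KEY: OpRole.Forward
        })
    block.append_op(
        type='c_sync_calc_stream',
        inputs={'X': var},
        outputs={'Out': var},
        attrs={OP_ROLE_KEY: OpRole.Forward})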
@@ -171,6 +171,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id
         program._pipeline_opt['micro_batch_size'] = self.micro_batch_size
         program._pipeline_opt['schedule_mode'] = self.schedule_mode
+        program._pipeline_opt['use_sharding'] = False
         optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize(
             loss, startup_program, parameter_list, no_grad_set)
         self.startup_program = orig_startup_program._pipeline_opt[
...
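
Collected in one place, the keys this hunk writes into program._pipeline_opt; the values below are placeholders, not real defaults.

# Placeholder values; the diff assigns self.start_pipeline_ring_id,
# self.micro_batch_size and self.schedule_mode respectively.
pipeline_opt = {
    'ring_id': 0,             # communication ring used by the pipeline
    'micro_batch_size': 2,
    'schedule_mode': '1F1B',
    'use_sharding': False,    # newly added: pipeline-only path, no sharding
}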
@@ -218,7 +219,6 @@ class PipelineOptimizer(MetaOptimizerBase):
        grad = None
        processed_param_name = set()
        first_optimize_op_idx = None
-        add_sync_calc_stream = False
        for idx, op in reversed(list(enumerate(block.ops))):
            if is_backward_op(op) and not first_optimize_op_idx:
                first_optimize_op_idx = idx + 1
...
@@ -242,15 +242,6 @@ class PipelineOptimizer(MetaOptimizerBase):
                    origin_param = origin_block.vars[op_role_var[i]]
                    if origin_param.is_distributed:
                        continue
-                    if not add_sync_calc_stream:
-                        add_sync_calc_stream = True
-                        block._insert_op(
-                            first_optimize_op_idx + offset,
-                            type='c_sync_calc_stream',
-                            inputs={'X': grad},
-                            outputs={'Out': grad},
-                            attrs={OP_ROLE_KEY: OpRole.Optimize})
-                        offset += 1
                    block._insert_op(
                        first_optimize_op_idx + offset,
...
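
The diff is cut off right after the remaining block._insert_op call. Purely as a hypothetical illustration of what such an insertion generally looks like in the fleet meta optimizers; the op type and attrs here are assumptions, not read from the collapsed lines.

# Hypothetical sketch, not taken from the collapsed part of the diff.
def insert_grad_allreduce(block, insert_idx, grad, ring_id, OP_ROLE_KEY, OpRole):
    block._insert_op(
        insert_idx,
        type='c_allreduce_sum',
        inputs={'X': grad},
        outputs={'Out': grad},
        attrs={
            'ring_id': ring_id,
            'use_calc_stream': True,
            OP_ROLE_KEY: OpRole.Optimize
        })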
The diff for the remaining file is collapsed.