Unverified commit c3974d0e, authored by lilong12 and committed by GitHub

[3D-parallel] Reformat pipeline parallel (#31786)

* update, test=develop
Parent 01aa2526
@@ -39,13 +39,13 @@ void SectionWorker::RunForward(
    int op_role = op->Attr<int>(std::string("op_role"));
    // We run op with op_role = kLRSched only for the first microbatch
    // to avoid increasing the @LR_DECAY_STEP@ multiple times.
-    bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
-                            op_role == (static_cast<int>(OpRole::kForward) |
-                                        static_cast<int>(OpRole::kLoss)) ||
-                            op_role == static_cast<int>(OpRole::kLRSched);
-    bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
-                      op_role == (static_cast<int>(OpRole::kForward) |
-                                  static_cast<int>(OpRole::kLoss));
+    bool run_first_mbatch = (op_role == static_cast<int>(OpRole::kForward)) ||
+                            (op_role == (static_cast<int>(OpRole::kForward) |
+                                         static_cast<int>(OpRole::kLoss))) ||
+                            (op_role == static_cast<int>(OpRole::kLRSched));
+    bool run_others = (op_role == static_cast<int>(OpRole::kForward)) ||
+                      (op_role == (static_cast<int>(OpRole::kForward) |
+                                   static_cast<int>(OpRole::kLoss)));
    if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) {
      VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch "
              << micro_id;
@@ -64,9 +64,9 @@ void SectionWorker::RunBackward(
        &unused_vars_) {
  for (auto &op : ops_) {
    int op_role = op->Attr<int>(std::string("op_role"));
-    if (op_role == static_cast<int>(OpRole::kBackward) ||
-        op_role == (static_cast<int>(OpRole::kBackward) |
-                    static_cast<int>(OpRole::kLoss))) {
+    if ((op_role == static_cast<int>(OpRole::kBackward)) ||
+        (op_role == (static_cast<int>(OpRole::kBackward) |
+                     static_cast<int>(OpRole::kLoss)))) {
      VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch "
              << micro_id;
      op->Run(*microbatch_scopes_[micro_id], place_);
......
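Aside (not part of the commit): the forward gating above can be restated in Python to make the role arithmetic easier to follow. The flag values below are hypothetical and chosen only for illustration; the point is that ops tagged kLRSched run only for micro-batch 0, so the @LR_DECAY_STEP@ counter advances once per mini-batch rather than once per micro-batch.

FORWARD, LOSS, LR_SCHED = 0x1, 0x2, 0x4  # illustrative role flags, not Paddle's real enum values

def should_run(op_role, micro_id):
    # Mirrors run_first_mbatch / run_others in SectionWorker::RunForward.
    run_first_mbatch = op_role in (FORWARD, FORWARD | LOSS, LR_SCHED)
    run_others = op_role in (FORWARD, FORWARD | LOSS)
    return run_first_mbatch if micro_id == 0 else run_others

assert should_run(LR_SCHED, micro_id=0) and not should_run(LR_SCHED, micro_id=1)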
@@ -47,7 +47,7 @@ def is_optimizer_op(op):
 class CollectiveHelper(object):
-    def __init__(self, role_maker, nrings=1, wait_port='6174'):
+    def __init__(self, role_maker, nrings=1, wait_port=True):
         self.nrings = nrings
         self.wait_port = wait_port
         self.role_maker = role_maker
@@ -65,14 +65,48 @@ class CollectiveHelper(object):
                 self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()
-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
+    def _init_communicator(self,
+                           program,
+                           current_endpoint,
+                           endpoints,
+                           rank,
+                           ring_id,
+                           wait_port,
+                           global_ring_id=None,
+                           sync=True):
         nranks = len(endpoints)
         other_endpoints = endpoints[:]
         other_endpoints.remove(current_endpoint)
         if rank == 0 and wait_port:
             wait_server_ready(other_endpoints)
+        def _add_sync_by_allreduce(block):
+            sync_var = block.create_var(
+                name=unique_name.generate('sync_var'),
+                dtype=core.VarDesc.VarType.INT32,
+                persistable=False,
+                stop_gradient=True)
+            block.append_op(
+                type='fill_constant',
+                inputs={},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'shape': [1],
+                    'dtype': sync_var.dtype,
+                    'value': 1,
+                    'force_cpu': False,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+            block.append_op(
+                type='c_allreduce_sum',
+                inputs={'X': [sync_var]},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'ring_id': global_ring_id,
+                    'use_calc_stream': True,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
         block = program.global_block()
         if core.is_compiled_with_cuda():
             comm_id_var = block.create_var(
@@ -128,6 +162,7 @@ class CollectiveHelper(object):
             raise ValueError(
                 "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
             )
+        if sync: _add_sync_by_allreduce(block)
     def _wait(self, current_endpoint, endpoints):
         assert (self.wait_port)
......
@@ -19,130 +19,21 @@ from paddle.fluid import core, unique_name
 from ..base.private_helper_function import wait_server_ready
 from paddle.fluid.optimizer import PipelineOptimizer as PO
 from .meta_optimizer_base import MetaOptimizerBase
-from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
+from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op
-def _get_node_num(endpoints):
-    ss = set()
-    for ep in endpoints:
-        ip = ep.split(":")[0].strip()
-        if ip not in ss:
-            ss.add(ip)
-    return len(ss)
-class PipelineHelper(object):
-    def __init__(self, role_maker, wait_port='6174'):
-        self.wait_port = wait_port
-        self.role_maker = role_maker
-    def update_startup_program(self,
-                               startup_program=None,
-                               inner_parallelism=None):
-        self.startup_program = startup_program
-        nranks = self.role_maker._worker_num()
-        rank = self.role_maker._worker_index()
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[rank]
-        node_num = _get_node_num(endpoints)
-        assert nranks % node_num == 0
-        # Create ring 0 for all gpus in the same pipeline
-        if inner_parallelism > 1:
-            pipeline_rank = rank % inner_parallelism
-            pipeline_id = rank // inner_parallelism
-            start_index = pipeline_id * inner_parallelism
-            pipeline_endpoints = endpoints[start_index:start_index +
-                                           inner_parallelism]
-            self._init_communicator(self.startup_program, current_endpoint,
-                                    pipeline_endpoints, pipeline_rank, 0,
-                                    self.wait_port)
-        pipeline_num = len(endpoints) // inner_parallelism
-        if pipeline_num == 1: return
-        # Create rings for gpus with the same pipeline id for data parallel
-        eps = []
-        pipeline_rank = rank % inner_parallelism
-        ring_id = pipeline_rank + 1
-        for i in range(pipeline_num):
-            eps.append(endpoints[i * inner_parallelism + pipeline_rank])
-        # rank in a ring of gpus with the same pipeline id for data parallel
-        dp_rank = rank // inner_parallelism
-        self._init_communicator(self.startup_program, current_endpoint, eps,
-                                dp_rank, ring_id, self.wait_port)
-        self._broadcast_params(ring_id)
-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
-        nranks = len(endpoints)
-        other_endpoints = endpoints[:]
-        other_endpoints.remove(current_endpoint)
-        if rank == 0 and wait_port:
-            wait_server_ready(other_endpoints)
-        block = program.global_block()
-        nccl_id_var = block.create_var(
-            name=unique_name.generate('nccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints,
-                OP_ROLE_KEY: OpRole.Forward,
-            })
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': ring_id,
-                OP_ROLE_KEY: OpRole.Forward,
-            })
-    def _broadcast_params(self, ring_id):
-        block = self.startup_program.global_block()
-        for var_name in block.vars:
-            if "nccl_id" in var_name: continue
-            param = block.var(var_name)
-            if not param.persistable:
-                continue
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={'ring_id': ring_id,
-                   OP_ROLE_KEY: OpRole.Forward})
 class PipelineOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(PipelineOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
-        # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = [
             "RecomputeOptimizer",
             "AMPOptimizer",
         ]
         self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.global_ring_id = 1
+        self.dp_ring_id = 2
+        self.start_pipeline_ring_id = 20  # Just a magic number
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -165,7 +56,11 @@ class PipelineOptimizer(MetaOptimizerBase):
     def _disable_strategy(self, dist_strategy):
         dist_strategy.pipeline = False
-        dist_strategy.pipeline_configs = {}
+        dist_strategy.pipeline_configs = {
+            "micro_batch_size": 1,
+            "accumulate_steps": 1,
+            "schedule_mode": "1F1B",
+        }
     def _enable_strategy(self, dist_strategy, context):
         dist_strategy.pipeline = True
@@ -175,61 +70,134 @@ class PipelineOptimizer(MetaOptimizerBase):
             "schedule_mode": "1F1B",
         }
+    def _broadcast_params(self, ring_id):
+        block = self.startup_program.global_block()
+        param = None
+        for param in block.iter_parameters():
+            if param.is_distributed:
+                continue
+            block.append_op(
+                type='c_broadcast',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={
+                    'ring_id': ring_id,
+                    'root': 0,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+        if not param: return  # no parameter on this device
+        block.append_op(
+            type='c_sync_comm_stream',
+            inputs={'X': param},
+            outputs={'Out': param},
+            attrs={'ring_id': ring_id,
+                   OP_ROLE_KEY: OpRole.Forward})
+    def _get_process_group_info(self):
+        # global ring info
+        self.global_endpoints = self.endpoints
+        self.global_rank = self.rank
+        self.global_nranks = self.nranks
+        # data parallel ring info
+        if self.pipeline_num > 1:
+            self.dp_rank = self.rank // self.inner_parallelism
+            self.dp_nranks = self.nranks // self.inner_parallelism
+            start_index = self.rank % self.inner_parallelism
+            self.dp_endpoints = [
+                self.endpoints[start_index + i * self.inner_parallelism]
+                for i in range(self.pipeline_num)
+            ]
+    def _init_process_group(self, pipeline_pair, pipeline_ring_map):
+        self._get_process_group_info()
+        collective_helper = CollectiveHelper(self.role_maker, wait_port=False)
+        # Create global ring for all gpus (ring_id = 0)
+        collective_helper._init_communicator(
+            self.startup_program, self.current_endpoint, self.global_endpoints,
+            self.global_rank, self.global_ring_id, True, self.global_ring_id,
+            True)
+        # Create pipeline rings
+        if self.inner_parallelism > 1:
+            pipeline_id = self.rank // self.inner_parallelism
+            start_index = pipeline_id * self.inner_parallelism
+            for pair in pipeline_pair:
+                pair_key = pair[0] * 1000 + pair[1]
+                ring_id = pipeline_ring_map[pair_key]
+                assert ring_id >= self.start_pipeline_ring_id
+                first_node = pair[0] + start_index
+                second_node = pair[1] + start_index
+                if self.rank != first_node and self.rank != second_node:
+                    continue
+                pipeline_endpoints = [
+                    self.endpoints[first_node], self.endpoints[second_node]
+                ]
+                pipeline_rank = 0 if self.rank == first_node else 1
+                pipeline_nranks = 2
+                collective_helper._init_communicator(
+                    self.startup_program, self.current_endpoint,
+                    pipeline_endpoints, pipeline_rank, ring_id, False,
+                    self.global_ring_id, True)
+        # Create dp rings
+        if self.pipeline_num > 1:
+            collective_helper._init_communicator(
+                self.startup_program, self.current_endpoint, self.dp_endpoints,
+                self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True)
+            self._broadcast_params(self.dp_ring_id)
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker._worker_index()]
-        self.wrapped_opt = PO(self.inner_opt,
-                              num_microbatches=self.num_microbatches)
-        node_num = _get_node_num(endpoints)
-        gpus_per_node = len(endpoints) // node_num
-        self.startup_program = startup_program
-        if startup_program is None:
-            self.startup_program = fluid.default_startup_program()
+        self.endpoints = self.role_maker._get_trainer_endpoints()
+        self.current_endpoint = self.endpoints[self.role_maker._worker_index()]
         self.rank = self.role_maker._worker_index()
         self.nranks = self.role_maker._worker_num()
-        assert self.nranks % node_num == 0
-        loss.block.program._pipeline_opt = dict()
-        loss.block.program._pipeline_opt['local_rank'] = self.rank
-        loss.block.program._pipeline_opt[
-            'micro_batch_size'] = self.micro_batch_size
-        loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode
-        optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize(
+        self.wrapped_opt = PO(self.inner_opt,
+                              num_microbatches=self.num_microbatches)
+        orig_startup_program = startup_program if startup_program else fluid.default_startup_program(
+        )
+        block = loss.block
+        program = block.program
+        program._pipeline_opt = dict()
+        program._pipeline_opt['local_rank'] = self.rank
+        program._pipeline_opt['global_ring_id'] = self.global_ring_id
+        program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id
+        program._pipeline_opt['micro_batch_size'] = self.micro_batch_size
+        program._pipeline_opt['schedule_mode'] = self.schedule_mode
+        optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize(
             loss, startup_program, parameter_list, no_grad_set)
-        assert prog_list
-        self.main_program_list = prog_list
-        self.main_program = loss.block.program
-        self.inner_parallelism = loss.block.program._pipeline_opt[
-            'inner_parallelism']
+        self.startup_program = orig_startup_program._pipeline_opt[
+            'startup_program']
+        self.inner_parallelism = program._pipeline_opt['inner_parallelism']
         assert self.nranks % self.inner_parallelism == 0
-        pipeline_helper = PipelineHelper(self.role_maker)
-        pipeline_helper.update_startup_program(
-            self.startup_program._pipeline_opt["startup_program"],
-            self.inner_parallelism)
-        pipeline_num = self.nranks // self.inner_parallelism
-        self._transpile_main_program(loss, pipeline_num, self.inner_parallelism)
+        assert prog_list
+        self.pipeline_num = len(self.endpoints) // self.inner_parallelism
+        self._init_process_group(pp_pair, ring_map)
+        self.main_program_list = prog_list
+        self.main_program = program
+        if self.pipeline_num > 1:
+            self._transpile_main_program(loss)
         return optimize_ops, params_grads
-    def _transpile_main_program(self, loss, pipeline_num, inner_parallelism):
-        if pipeline_num <= 1: return
-        self._insert_loss_grad_ops(loss, pipeline_num)
-        for ring_id in range(1, inner_parallelism + 1):
-            self._insert_allreduce_ops(ring_id)
+    def _transpile_main_program(self, loss):
+        self._insert_loss_grad_ops(loss, self.pipeline_num)
+        self._insert_allreduce_ops(self.dp_ring_id)
     def _insert_loss_grad_ops(self, loss, pipeline_num):
         """
         In order to keep the learning rate consistent in different numbers of
         training workers, we scale the loss grad by the number of workers
         """
-        block = self.main_program_list[-1]['program'].global_block()
+        block = self.main_program_list[-1].global_block()
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
@@ -244,57 +212,53 @@ class PipelineOptimizer(MetaOptimizerBase):
                 })
     def _insert_allreduce_ops(self, ring_id):
-        block = self.main_program_list[ring_id - 1]['program'].global_block()
+        block = self.main_program._pipeline_opt['section_program'].global_block(
+        )
         origin_block = self.main_program.global_block()
         grad = None
         processed_param_name = set()
+        first_optimize_op_idx = None
+        add_sync_calc_stream = False
         for idx, op in reversed(list(enumerate(block.ops))):
+            if is_backward_op(op) and not first_optimize_op_idx:
+                first_optimize_op_idx = idx + 1
+                # no optimize phase
+                if first_optimize_op_idx == len(block.ops): return
             if is_backward_op(op) and \
                     OP_ROLE_VAR_KEY in op.attr_names:
                 op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
                 if len(op_role_var) == 0:
                     continue
                 assert len(op_role_var) % 2 == 0
-                offset = idx
+                offset = 0
                 for i in range(0, len(op_role_var), 2):
                     param_name = op_role_var[i]
                     param = block.vars[op_role_var[i]]
                     if param_name in processed_param_name: continue
                     processed_param_name.add(param_name)
-                    grad = block.vars[op_role_var[i + 1]]
+                    grad_name = op_role_var[i + 1]
+                    if not 'MERGED' in grad_name: grad_name += '@MERGED'
+                    grad = block.vars[grad_name]
                     origin_param = origin_block.vars[op_role_var[i]]
                     if origin_param.is_distributed:
                         continue
-                    if offset == idx:
-                        offset += 1
+                    if not add_sync_calc_stream:
+                        add_sync_calc_stream = True
                         block._insert_op(
-                            offset,
+                            first_optimize_op_idx + offset,
                             type='c_sync_calc_stream',
                             inputs={'X': grad},
                            outputs={'Out': grad},
-                            attrs={OP_ROLE_KEY: OpRole.Backward})
+                            attrs={OP_ROLE_KEY: OpRole.Optimize})
                    offset += 1
                    block._insert_op(
-                        offset,
+                        first_optimize_op_idx + offset,
                        type='c_allreduce_sum',
                        inputs={'X': grad},
                        outputs={'Out': grad},
                        attrs={
                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Backward
+                            'use_calc_stream': True,
+                            OP_ROLE_KEY: OpRole.Optimize
                        })
-        if grad is None:
-            return
-        for idx, op in enumerate(block.ops):
-            if is_optimizer_op(op):
-                block._insert_op(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': grad},
-                    outputs={'Out': grad},
-                    attrs={'ring_id': ring_id,
-                           OP_ROLE_KEY: OpRole.Backward})
-                break
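Aside (not part of the commit): the docstring of _insert_loss_grad_ops gives the reasoning for the scaling step. With pipeline_num data-parallel replicas whose gradients are combined by c_allreduce_sum, the loss gradient is divided by the replica count so the summed gradient, and therefore the effective learning rate, matches single-replica training. A minimal self-contained sketch of that rule, using a hypothetical helper name:

def scale_loss_grad(loss_grad, pipeline_num):
    # Divide by the number of data-parallel pipelines; the summing
    # all-reduce then restores the single-replica gradient magnitude.
    return loss_grad * (1.0 / pipeline_num)

# Four replicas each producing loss@GRAD = 1.0 still contribute 1.0 in total
# after scaling followed by all-reduce-sum.
assert sum(scale_loss_grad(1.0, 4) for _ in range(4)) == 1.0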
@@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                 outputs={"Out": out_var},
                 attrs={
                     "in_dtype": in_var.dtype,
-                    "out_dtype": out_var.dtype
+                    "out_dtype": out_var.dtype,
+                    "op_device": op.attr("op_device")
                 })
             num_cast_ops += 1
         _rename_arg(op, in_var.name, out_var.name)
@@ -171,8 +172,11 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name,
             type="cast",
             inputs={"X": target_var},
             outputs={"Out": cast_var},
-            attrs={"in_dtype": target_var.dtype,
-                   "out_dtype": cast_var.dtype})
+            attrs={
+                "in_dtype": target_var.dtype,
+                "out_dtype": cast_var.dtype,
+                "op_device": op.attr("op_device")
+            })
         num_cast_ops += 1
         op_var_rename_map[block.idx][target_var.name] = cast_var.name
......
@@ -427,7 +427,7 @@ class Section(DeviceWorker):
         section_param.schedule_mode = schedule_mode
         cfg = section_param.section_config
         program = pipeline_opt["section_program"]
-        cfg.program_desc.ParseFromString(program["program"]._get_desc()
+        cfg.program_desc.ParseFromString(program._get_desc()
                                          .serialize_to_string())
         # TODO: why does not work
         # cfg.program_desc.CopyFrom(program.program._get_desc())
......
@@ -1458,7 +1458,7 @@ class Executor(object):
             dataset._prepare_to_run()
             real_fetch_list = []
             if program._pipeline_opt:
-                real_program = program._pipeline_opt["section_program"]['program']
+                real_program = program._pipeline_opt["section_program"]
                 for fetch_var in fetch_list:
                     if isinstance(fetch_var, Variable):
                         fetch_var_name = fetch_var.name
@@ -1467,13 +1467,20 @@ class Executor(object):
                     if fetch_var_name in real_program.global_block().vars:
                         real_fetch_list.append(fetch_var)
-                program._pipeline_opt["section_program"][
-                    'program'] = self._add_feed_fetch_ops(
-                        program=program._pipeline_opt["section_program"]['program'],
+                program._pipeline_opt["section_program"] = self._add_feed_fetch_ops(
+                    program=program._pipeline_opt["section_program"],
                     feed=[],
                     fetch_list=real_fetch_list,
                     feed_var_name='feed',
                     fetch_var_name='fetch')
+                main_block = program._pipeline_opt["section_program"].block(0)
+                for op in main_block.ops:
+                    # set the op_role of fetch op to Optimize to avoid
+                    # erase the fetched vars by gc for pipeline
+                    if op.type == 'fetch':
+                        op._set_attr(
+                            'op_role',
+                            core.op_proto_and_checker_maker.OpRole.Optimize)
                 fetch_list = None
             scope, trainer = self._prepare_trainer(
......
This diff is collapsed.
@@ -66,12 +66,21 @@ def cnn_model(data):
     param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
     scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+    with fluid.device_guard("gpu:1"):
         predict = fluid.layers.fc(
             input=conv_pool_2,
             size=SIZE,
             act="softmax",
             param_attr=fluid.param_attr.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01)))
+        # To cover @RENAMED@GRADIENT
+        predict2 = fluid.layers.fc(
+            input=conv_pool_1,
+            size=SIZE,
+            act="softmax",
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01)))
+        predict += predict2
     return predict
@@ -108,7 +117,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
         bd = [steps_per_pass * p for p in passes]
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
         lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
-        opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
+        opt = fluid.optimizer.Momentum(
+            learning_rate=lr_val,
+            momentum=0.9,
+            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
         acc_steps = 2  # accumulated steps for pipeline
         if dist_strategy:
@@ -120,6 +132,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
             fleet.init(is_collective=True)
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
+            strategy.amp = True
             strategy.pipeline_configs = {
                 'micro_batch_size': batch_size,
                 'schedule_mode': '1F1B',
......