Unverified commit 3d96601b authored by lilong12, committed by GitHub

modify pipeline optimizer to only support the mode of sync pipeline training (#25065)

* modify pipeline optimizer, test=develop
Parent b555378e
paddle.fluid.optimizer.PipelineOptimizer (paddle.fluid.optimizer.PipelineOptimizer, ('document', '2e55a29dbeb874934f7a1a1af3a22b8c'))
paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'num_microbatches', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
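For quick orientation, a minimal sketch of how the new constructor signature recorded in API.spec above is meant to be called; the SGD learning rate and the num_microbatches value are illustrative assumptions only, not part of this change:

    import paddle.fluid as fluid

    # New signature: PipelineOptimizer(optimizer, num_microbatches=1, start_cpu_core_id=0)
    sgd = fluid.optimizer.SGD(learning_rate=0.5)
    pipeline_opt = fluid.optimizer.PipelineOptimizer(sgd, num_microbatches=2)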
...@@ -3633,30 +3633,15 @@ class PipelineOptimizer(object):
    """
    :api_attr: Static Graph

    Pipeline Optimizer: make a program run in pipeline mode by splitting it
    into multiple sections (sub-programs), each of which runs on one device.
    This enables the training of large-scale models and the use of
    heterogeneous devices, with all sections running in a pipelined fashion.

    Args:
        optimizer (Optimizer): The optimizer to use, such as SGD.
        num_microbatches (int): Number of microbatches. [Optional. Default: 1].
        start_cpu_core_id (int): The first cpu core id to use. [Optional. Default: 0].

    Examples:
        .. code-block:: python
...@@ -3664,271 +3649,835 @@ class PipelineOptimizer(object):
            import paddle.fluid as fluid
            import paddle.fluid.layers as layers
            import numpy as np

            with fluid.device_guard("gpu:0"):
                x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
                y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
                data_loader = fluid.io.DataLoader.from_generator(
                    feed_list=[x, y],
                    capacity=64,
                    use_double_buffer=True,
                    iterable=False)

                emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
                emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)

            with fluid.device_guard("gpu:1"):
                concat = layers.concat([emb_x, emb_y], axis=1)
                fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
                loss = layers.reduce_mean(fc)

            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
            optimizer = fluid.optimizer.PipelineOptimizer(optimizer)
            optimizer.minimize(loss)

            def train_reader():
                for _ in range(4):
                    x = np.random.random(size=[1]).astype('int64')
                    y = np.random.random(size=[1]).astype('int64')
                    yield x, y
            data_loader.set_sample_generator(train_reader, batch_size=1)

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            batch_size = 1
            filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
            dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
            dataset.set_use_var([x,y])
            dataset.set_batch_size(batch_size)
            dataset.set_filelist(filelist)

            data_loader.start()
            exe.train_from_dataset(
                fluid.default_main_program(),
                dataset)
            data_loader.reset()
    """
    def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
        if framework.in_dygraph_mode():
            raise Exception("In dygraph, don't support PipelineOptimizer.")
        if not isinstance(optimizer, Optimizer):
            raise ValueError("The 'optimizer' parameter for "
                             "PipelineOptimizer must be an instance of "
                             "Optimizer, but the given type is {}.".format(
                                 type(optimizer)))
        self._optimizer = optimizer
        assert num_microbatches >= 1, (
            "num_microbatches must be a positive value.")
        self._num_microbatches = num_microbatches
        assert start_cpu_core_id >= 0, (
            "start_cpu_core_id must be greater than or equal to 0.")
        self._start_cpu_core_id = start_cpu_core_id
        self._place_list = None
        op_maker = core.op_proto_and_checker_maker
        self._op_role = op_maker.OpRole
        self._op_role_key = op_maker.kOpRoleAttrName()
        self._op_role_var_key = op_maker.kOpRoleVarAttrName()
        self._op_device_key = op_maker.kOpDeviceAttrName()
        self._param_device_map = dict()
    def _create_vars(self, block, main_program):
        # Create vars for block, copied from main_program's global block
        used_var_set = set()
        for op_idx in range(block.desc.op_size()):
            op_desc = block.desc.op(op_idx)
            vars = op_desc.input_arg_names() + op_desc.output_arg_names()
            for var in vars:
                # a var whose name contains "blocking_queue"
                # only exists in startup program
                if var in used_var_set or "_blocking_queue" in var:
                    continue
                used_var_set.add(var)
                source_var = main_program.block(0).var(str(var))
                if source_var.type == core.VarDesc.VarType.READER:
                    block.create_var(name=var, type=core.VarDesc.VarType.READER)
                else:
                    block._clone_variable(source_var, False)
    def _is_loss_grad_op(self, op):
        if self._op_role_key not in op.attr_names:
            return False
        op_role = int(op.all_attrs()[self._op_role_key])
        return op_role & int(self._op_role.Backward) and op_role & int(
            self._op_role.Loss)

    def _is_backward_op(self, op):
        return self._op_role_key in op.attr_names and int(op.all_attrs()[
            self._op_role_key]) & int(self._op_role.Backward)

    def _is_optimize_op(self, op):
        return self._op_role_key in op.attr_names and int(op.all_attrs()[
            self._op_role_key]) & int(self._op_role.Optimize)

    def _is_update_op(self, op):
        return 'Param' in op.input_names and 'Grad' in op.input_names and (
            "LearningRate" in op.input_names)

    def _split_program(self, main_program):
        """
        Split a program into sections according to devices that ops run on.

        Args:
            main_program (Program): the main program
        """
        programs = []
        # Map from device to its corresponding section program info
        device_program_map = dict()
        block = main_program.block(0)

        for op in block.ops:
            device = op.attr(self._op_device_key)

            if device not in device_program_map:
                program = {"program": Program()}
                device_program_map[device] = program
            program = device_program_map[device]
            op_desc = op.desc
            ap_op = program["program"].block(0).desc.append_op()
            ap_op.copy_from(op_desc)

        for key in sorted(device_program_map.keys()):
            program = device_program_map[key]
            program['program']._sync_with_cpp()
            programs.append(program)

        return programs
    def _find_post_op(self, ops, cur_op, var_name):
        """
        Find the real post op that has variable named var_name as input.

        Args:
            ops (list): A list of ops.
            cur_op (Operator): Current operator which has variable named
                               var_name as output.
            var_name (string): Variable name.
        """
        post_op = []
        before = True
        for op in ops:
            if op == cur_op:
                before = False
                continue
            if before:
                continue
            for in_var_name in op.input_arg_names:
                if in_var_name == var_name:
                    post_op.append(op)
        if post_op:
            if not len(post_op) == 1:
                raise ValueError("Each op can only have one post op.")
            return post_op[0]
        return None
    def _find_real_prev_op(self, ops, cur_op, var_name):
        """
        Find the real previous op that outputs variable named var_name.

        Args:
            ops (list): A list of ops.
            cur_op (Operator): Current operator which has variable named
                               var_name as input.
            var_name (string): Variable name.
        """
        prev_op = []
        for op in ops:
            if op == cur_op:
                break
            for out_var_name in op.output_arg_names:
                if out_var_name == var_name:
                    prev_op.append(op)
        if prev_op:
            # An op may have more than one prev op; e.g., for 'learning_rate',
            # there may be multiple ops that have it as an output.
            return prev_op[-1]
        return None
    def _rename_arg(self, op, old_name, new_name):
        op_desc = op.desc
        if isinstance(op_desc, tuple):
            op_desc = op_desc[0]
        op_desc._rename_input(old_name, new_name)
        op_desc._rename_output(old_name, new_name)

    def _create_var(self, block, ref_var, name):
        """
        Create a new var for block, which has the same type,
        shape and dtype as ref_var, then rename it with the
        name `name`.
        """
        new_var = block.create_var(
            name=name,
            shape=ref_var.shape,
            dtype=ref_var.dtype,
            type=ref_var.type,
            lod_level=ref_var.lod_level,
            persistable=False,
            is_data=False,
            need_check_feed=ref_var.desc.need_check_feed())
        return new_var
    def _get_data_var_info(self, block):
        """
        Get all vars whose is_data attribute is true and rename them.

        For PipelineTrainer, all data vars are bound to the minibatch scope,
        so they have to be fed to each microbatch under new names to avoid
        conflicts.
        """
        # A map from var name to the renamed name.
        raw_name_new_name_map = dict()
        # Because we will create vars in block, it is safer
        # to get all var_names before iteration.
        var_names = list(block.vars.keys())
        for var_name in var_names:
            var = block.var(var_name)
            if not var.is_data:
                continue
            assert var_name not in raw_name_new_name_map, (
                "{} has already been processed.".format(var_name))
            new_name = unique_name.generate(var_name)
            raw_name_new_name_map[var_name] = new_name
            new_var = self._create_var(block, var, new_name)
            new_var.is_data = False

        # A map from each data var to the devices that use it.
        data_devices_map = dict()
        for op in block.ops:
            dev_spec = op.attr(self._op_device_key)
            for var_name in op.input_arg_names:
                if var_name not in raw_name_new_name_map:
                    continue
                if not var_name in data_devices_map:
                    data_devices_map[var_name] = []
                if not dev_spec in data_devices_map[var_name]:
                    data_devices_map[var_name].append(dev_spec)
                new_name = raw_name_new_name_map[var_name]
                #self._rename_arg(op, var_name, new_name)
        return data_devices_map, raw_name_new_name_map

    def _rename_var_in_block(self, block, raw_name_new_name_map):
        """
        Rename vars whose names are in raw_name_new_name_map to the
        corresponding new names.
        """
        for op in block.ops:
            if op.type == "enqueue" or op.type == "dequeue":
                continue
            for var_name in op.input_arg_names:
                if var_name in raw_name_new_name_map:
                    new_name = raw_name_new_name_map[var_name]
                    self._rename_arg(op, var_name, new_name)

    def _insert_enq_deq_for_data_var(self, main_block, programs, startup,
                                     devices):
        """
        Insert enqueue and dequeue ops for data vars.

        Args:
            main_block (Block): Global block for main program
            programs (dict): Dictionary for section params
startup (Program): Startup program
devices (list): List of devices in the format (dev:dev_index)
"""
main_program = main_block.program
data_devices_map, raw_name_new_name_map = self._get_data_var_info(
main_block)
first_prog = programs[0]['program']
first_block = first_prog.block(0)
enqueue_index = 0
if first_block.ops[0].type == "create_py_reader" or (
first_block.ops[1].type == "create_py_reader"):
for op in first_block.ops:
if op.type == "read":
enqueue_index += 1
break
enqueue_index += 1
first_dev_spec = devices[0]
for var_name in data_devices_map.keys():
for device in data_devices_map[var_name]:
                # step1: generate a queue for each pair of data var and the
                # device that consumes it
queue_name = var_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
queue_var = startup.block(0).create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup.block(0).append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
main_var = main_block.var(var_name)
assert main_var.is_data
if not var_name in first_block.vars:
self._create_var(first_block, main_var, var_name)
first_block._insert_op(
index=enqueue_index,
type='enqueue',
inputs={'X': first_block.var(var_name)},
attrs={
'queue_name': queue_name,
self._op_device_key: first_dev_spec,
self._op_role_key: self._op_role.Forward
})
                # Get the device that the data is on
assert device in devices
prog_index = devices.index(device)
prog = programs[prog_index]['program']
block = prog.block(0)
index = 0
if device == first_dev_spec:
index = enqueue_index + 1
new_name = raw_name_new_name_map[var_name]
source_var = main_program.block(0).var(var_name)
new_var = self._create_var(block, source_var, new_name)
block._insert_op(
index=index,
type='dequeue',
outputs={'Out': [new_var]},
attrs={
self._op_device_key: device,
self._op_role_key: self._op_role.Forward,
'queue_name': queue_name,
})
self._rename_var_in_block(block, raw_name_new_name_map)
    def _strip_grad_suffix(self, name):
        """
        Strip the grad suffix from the given variable name.
        """
        pos = name.find(core.grad_var_suffix())
        return name[:pos] if pos != -1 else name

    def _append_grad_suffix(self, name):
        """
        Append the grad suffix to the given variable name.
        """
        return name + core.grad_var_suffix()

    def _update_param_device_map(self, params_grads, block):
        for param_grad in params_grads:
            if not param_grad[0].trainable: continue
            param_name = param_grad[0].name
            ops = block.ops
            for op in ops:
                input_arg_names = op.input_arg_names
                if param_name in input_arg_names:
                    self._param_device_map[param_name] = op.attr(
                        self._op_device_key)
                    break

    def _add_opdevice_attr_for_regularization_clip(self, block):
        """
        Add op_device attribute for regularization and clip ops.
        """
        for op in block.ops:
            # role for regularization and clip ops is optimize
            if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize):
                continue
            if op.has_attr(self._op_device_key) and (
                    op.attr(self._op_device_key) != ""):
                continue
            assert self._op_role_var_key in op.attr_names
            op_role_var = op.all_attrs()[self._op_role_var_key]
            assert len(op_role_var) == 2
            param_name = block.vars[op_role_var[0]].name
            device = self._param_device_map[param_name]
            op._set_attr(self._op_device_key, device)

    def _add_default_opdevice_attr(self, block):
        """
        1. Add default op_device attribute for lr-related ops.
           The default value is that of the first place.
        2. Add default op_device attribute for sum ops added during
           backward. For these ops, we set the op_device attribute
           as that of their post op, i.e., the op that takes the output of
           the sum op as an input.
        """
        first_device = ""

        # Get the device spec of the first place.
        # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device,
        # e.g. 'gpu:0', 'gpu:1', etc.
        for op in block.ops:
            if op.has_attr(self._op_device_key) and (
                    op.attr(self._op_device_key) != ""):
                first_device = op.attr(self._op_device_key)
                break
        assert first_device
# set op_device attr for lr-related ops
lrsched_role = int(self._op_role.LRSched)
for op in block.ops:
if not op.has_attr(self._op_device_key) or (
op.attr(self._op_device_key) == ""):
if op.type == "sum":
# For sum ops that compute the sum of @RENAMED@ vars
for name in op.desc.input_arg_names():
assert '@RENAME@' in name
assert len(op.desc.output_arg_names()) == 1
out_name = op.desc.output_arg_names()[0]
post_op = self._find_post_op(block.ops, op, out_name)
device = post_op.attr(self._op_device_key)
assert device
op._set_attr(self._op_device_key, device)
continue
assert op.attr(self._op_role_key) == lrsched_role, (
"Op whose op_device attr has not been set for pipeline"
" must be of the role LRSched.")
op._set_attr(self._op_device_key, first_device)
def _check_validation(self, block):
"""
        Check whether ops in a block are all valid (i.e., the
op_device attribute has been set).
Then, return all device specifications in order.
"""
device_specs = []
for op in block.ops:
type = op.type
if not op._has_kernel(type):
assert op.type == "conditional_block" and (
op.attr(self._op_role_key) == int(self._op_role.LRSched)), (
"Now, the only supported op without kernel is "
"conditional_block, and its op role must be LRSched.")
assert op.has_attr(self._op_device_key), (
"op ({}) has no {} attribute.".format(op.type,
self._op_device_key))
dev_spec = op.attr(self._op_device_key)
assert dev_spec, ("op_device attribute for op "
"{} has not been set.".format(op.type))
if not dev_spec in device_specs:
device_specs.append(dev_spec)
return device_specs
def _insert_enq_deq_ops_for_boundaries(self, block, origin_block,
startup_program):
"""
Insert a pair of enqueue and dequeue ops for every two
consecutive ops on different devices.
"""
startup_block = startup_program.global_block()
extra_index = 0
# A map from var to device spec where op takes it as input,
# avoiding multiple enqueue and dequeue ops.
var_devspec = dict()
for index, op in list(enumerate(origin_block.ops)):
cur_device_spec = op.attr(self._op_device_key)
for var_name in op.input_arg_names:
                # Skip vars that only exist in the startup program, e.g., the
                # lod_tensor_blocking_queue created by DataLoader.
if not var_name in origin_block.vars: continue
var = block.var(var_name)
# skip data, because we will process it later
if var.is_data: continue
prev_op = self._find_real_prev_op(origin_block.ops, op,
var_name)
if prev_op is None:
continue
prev_device_spec = prev_op.attr(self._op_device_key)
if prev_device_spec != cur_device_spec:
if var_name not in var_devspec:
var_devspec[var_name] = []
if cur_device_spec in var_devspec[var_name]: continue
var_devspec[var_name].append(cur_device_spec)
queue_name = var_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
queue_var = startup_block.create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup_block.append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
op_role = op.all_attrs()[self._op_role_key]
var = block.vars[var_name]
block._insert_op(
index=index + extra_index,
type='enqueue',
inputs={'X': var},
attrs={
'queue_name': queue_name,
self._op_device_key: prev_device_spec,
self._op_role_key: op_role
})
extra_index += 1
block._insert_op(
index=index + extra_index,
type='dequeue',
outputs={'Out': [var]},
attrs={
self._op_device_key: cur_device_spec,
'queue_name': queue_name,
self._op_role_key: op_role
})
extra_index += 1
def _add_dequeue_ops_for_optimize(self, block, startup_program):
startup_block = startup_program.global_block()
grad_queue_map = dict()
grad_device_map = dict()
optimize_index = None
grad_names_to_dequeue = []
for index, op in reversed(list(enumerate(block.ops))):
device = op.attr(self._op_device_key)
# Optimizer pass
if not self._is_optimize_op(op):
optimize_index = index + 1
break
if not self._is_update_op(op): continue
assert self._op_role_var_key in op.attr_names
op_role_var = op.all_attrs()[self._op_role_var_key]
assert len(op_role_var) == 2
grad_name = op_role_var[1]
assert grad_name not in grad_device_map
assert grad_name not in grad_names_to_dequeue
grad_device_map[grad_name] = device
grad_names_to_dequeue.append(grad_name)
for grad_name in grad_names_to_dequeue:
device = grad_device_map[grad_name]
grad_names = []
grads = []
queue_name = grad_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
grad_queue_map[grad_name] = queue_name
ref_var = block.vars[grad_name]
queue_var = startup_block.create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup_block.append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
orig_var_name = self._strip_grad_suffix(grad_name)
for _ in range(self._num_microbatches):
u_name = unique_name.generate(orig_var_name)
u_grad_name = self._append_grad_suffix(u_name)
grad_var = self._create_var(block, ref_var, u_grad_name)
grad_names.append(u_grad_name)
grads.append(grad_var)
block._insert_op(
index=optimize_index,
type='dequeue',
outputs={'Out': grads},
attrs={
self._op_device_key: device,
'queue_name': queue_name,
self._op_role_key: self._op_role.Optimize
})
block._insert_op(
index=optimize_index + 1,
type='sum',
inputs={'X': grad_names},
outputs={'Out': ref_var},
attrs={
self._op_device_key: device,
self._op_role_key: self._op_role.Optimize
})
return grad_queue_map
def _insert_enq_deq_ops_for_update(self, block, startup_program):
"""
Insert enqueue and dequeue ops for gradients of parameters.
"""
startup_block = startup_program.global_block()
grad_queue_map = self._add_dequeue_ops_for_optimize(block,
startup_program)
for index, op in reversed(list(enumerate(block.ops))):
offset = index
device = op.attr(self._op_device_key)
# Backward pass
if self._is_loss_grad_op(op):
loss_grad_var = block.vars[op.output_arg_names[0]]
scale_factor = self._num_microbatches
block._insert_op(
index=index + 1,
type='scale',
inputs={'X': loss_grad_var},
outputs={'Out': loss_grad_var},
attrs={
'scale': 1.0 / scale_factor,
self._op_device_key: device,
self._op_role_key: self._op_role.Backward
})
break
if self._is_backward_op(op) and (
self._op_role_var_key in op.attr_names):
op_role_var = op.all_attrs()[self._op_role_var_key]
if len(op_role_var) == 0:
continue
assert len(op_role_var) % 2 == 0
for i in range(0, len(op_role_var), 2):
grad_name = op_role_var[i + 1]
grad_var = block.vars[grad_name]
assert grad_name in grad_queue_map
queue_name = grad_queue_map[grad_name]
block._insert_op(
index=offset + 1,
type='enqueue',
inputs={'X': block.vars[grad_name]},
attrs={
'queue_name': queue_name,
self._op_device_key: device,
self._op_role_key: self._op_role.Backward
})
offset += 1
def _add_sub_blocks(self, main_block, program_list):
main_program = main_block.program
for prog_info in program_list:
prog = prog_info['program']
for op in prog.block(0).ops:
if not op.has_attr('sub_block'):
continue
origin_sub_block_id = op.attr('sub_block').id
origin_sub_block = main_program.block(origin_sub_block_id)
new_sub_block = prog._create_block(parent_idx=0)
                for origin_op in origin_sub_block.ops:
                    op_desc = origin_op.desc
                    ap_op = new_sub_block.desc.append_op()
                    ap_op.copy_from(op_desc)
                new_sub_block._sync_with_cpp()
                op._set_attr('sub_block', new_sub_block)

    def _get_device_info(self, block):
        for op in block.ops:
            if not op._has_kernel(op.type): continue
            op_device = op.attr(self._op_device_key)
            return op_device

    def _process_persistable_vars_in_multi_sections(self, main_program,
                                                    startup_prog, program_list):
        """
        Special Case: process persistable vars that exist in
        multiple sections, e.g., shared weight.
        """
# var_info = {var_name: [program1, program2...]},
# persistable var only
var_info = dict()
for prog_info in program_list:
prog = prog_info['program']
block = prog.block(0)
for var_name in block.vars:
var = block.var(var_name)
if not var.persistable: continue
if not var_name in var_info:
var_info[var_name] = []
if not prog in var_info[var_name]:
var_info[var_name].append(prog)
for var_name in list(var_info.keys()):
if len(var_info[var_name]) == 1:
var_info.pop(var_name)
# write_info = {var_name: program}, where program is the only program
# in which the var named var_name is written.
write_info = dict()
for var_name in var_info.keys():
for prog in var_info[var_name]:
block = prog.block(0)
for op in block.ops:
if op.type == "dequeue": continue
# We have processed lr related vars
if op.attr(self._op_role_key) == int(
self._op_role.Optimize.LRSched):
continue
if var_name in op.desc.output_arg_names():
assert var_name not in write_info, (
"two sections write the same var({}): second "
"op {}.".format(var_name, op))
write_info[var_name] = prog
break
for var_name in var_info.keys():
# Case 1: read only variables, no special process
if not var_name in write_info: continue
# Case 2: one write multiple reads
write_prog = write_info[var_name]
write_block = write_prog.block(0)
write_device = self._get_device_info(write_block)
all_progs = var_info[var_name]
for prog in all_progs:
if prog == write_prog: continue
queue_name = var_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
queue_var = startup_prog.block(0).create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup_prog.block(0).append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
write_block._insert_op(
index=0,
type='enqueue',
inputs={'X': write_block.var(var_name), },
attrs={
'queue_name': queue_name,
self._op_device_key: write_device,
# A trick to make the role LRSched to avoid copy every
# microbatch
self._op_role_key: self._op_role.LRSched
})
read_block = prog.block(0)
read_device = self._get_device_info(read_block)
read_block._insert_op(
index=0,
type='dequeue',
outputs={'Out': [read_block.var(var_name)]},
attrs={
self._op_device_key: read_device,
# A trick to make the role LRSched to avoid copy every
# microbatch
self._op_role_key: self._op_role.LRSched,
'queue_name': queue_name,
})
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        main_block = loss.block
        if startup_program is None:
            startup_program = default_startup_program()
        optimize_ops, params_grads = self._optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set)
        self._update_param_device_map(params_grads, main_block)

        # Step1: add default op_device attribute for regularization and clip ops
        self._add_opdevice_attr_for_regularization_clip(main_block)

        # Step2: add default op_device attribute for ops whose op_device
        # attribute has not been set yet.
        self._add_default_opdevice_attr(main_block)
        device_specs = self._check_validation(main_block)

        # Step3: add enqueue and dequeue ops between section boundaries
        origin_prog = main_block.program.clone(for_test=False)
        origin_main_block = origin_prog.global_block()
        self._insert_enq_deq_ops_for_boundaries(main_block, origin_main_block,
                                                startup_program)

        # Step4: add a pair of enqueue and dequeue ops for parameter gradients
        self._insert_enq_deq_ops_for_update(main_block, startup_program)

        main_program = main_block.program

        place_list = []
        place_id_list = []
        for dev_spec in device_specs:
            if dev_spec == "cpu":
                place_list.append(core.CPUPlace())
                place_id_list.append(-1)
            elif "gpu" in dev_spec and ":" in dev_spec:
                dev_index = dev_spec.split(":")[1]
                place_list.append(core.CUDAPlace(int(dev_index)))
                place_id_list.append(int(dev_index))
            else:
                raise ValueError("Unknown device type: %s" % dev_spec)

        # Step5: split the program into sections and add pairs of
        # enqueue and dequeue ops for data vars.
        if len(place_list) == 0:
            program_list = []
            ptmp = {
                "program": main_program,
                "input_set": set(),
                "output_set": set()
            }
            program_list.append(ptmp)
        else:
            program_list = self._split_program(main_program)
            for p in program_list:
                self._create_vars(p["program"].block(0), main_program)
        self._insert_enq_deq_for_data_var(main_block, program_list,
                                          startup_program, device_specs)

        # Step6: Special Case: process persistable vars that exist in
        # multiple sections
        self._process_persistable_vars_in_multi_sections(
            main_program, startup_program, program_list)

        # Step7: Add sub blocks for section programs
        self._add_sub_blocks(main_block, program_list)

        main_program._pipeline_opt = {
            "trainer": "PipelineTrainer",
            "device_worker": "Section",
            "section_program_list": program_list,
            "place_list": place_list,
            "place_id_list": place_id_list,
            "sync_steps": -1,
            "queue_size": self._num_microbatches,
            "start_cpu_core_id": self._start_cpu_core_id,
        }
        return optimize_ops, params_grads, program_list
class RecomputeOptimizer(Optimizer):
......
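For orientation, a minimal end-to-end sketch of driving the reworked optimizer; it uses only calls that appear in the docstring example and in minimize() above, and the two-GPU layout, layer sizes, and num_microbatches=2 are illustrative assumptions rather than part of the commit:

    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    # Two device sections, mirroring the docstring example above.
    with fluid.device_guard("gpu:0"):
        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
        emb_x = layers.embedding(input=x, size=[10, 2], is_sparse=False)
    with fluid.device_guard("gpu:1"):
        fc = layers.fc(input=emb_x, size=1, bias_attr=False)
        loss = layers.reduce_mean(fc)

    sgd = fluid.optimizer.SGD(learning_rate=0.5)
    pipeline_opt = fluid.optimizer.PipelineOptimizer(sgd, num_microbatches=2)
    # minimize() now also returns the per-device section programs.
    optimize_ops, params_grads, program_list = pipeline_opt.minimize(loss)
    # After minimize(), fluid.default_main_program()._pipeline_opt holds the
    # PipelineTrainer / Section device-worker configuration consumed by
    # Executor.train_from_dataset().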
...@@ -49,7 +49,6 @@ endif()
LIST(REMOVE_ITEM TEST_OPS test_conv2d_transpose_op)
if(WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_boxps)
    LIST(REMOVE_ITEM TEST_OPS test_paddlebox_datafeed)
    LIST(REMOVE_ITEM TEST_OPS test_trainer_desc)
    LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
    LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
...@@ -89,7 +88,6 @@ endif()
if(NOT WITH_GPU OR WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_pipeline)
    LIST(REMOVE_ITEM TEST_OPS test_boxps)
    LIST(REMOVE_ITEM TEST_OPS test_paddlebox_datafeed)
endif()
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
......
...@@ -87,117 +87,5 @@ class TestRunCmd(unittest.TestCase):
        self.assertTrue(ret2 == 0)
class TestBoxPSPreload(unittest.TestCase):
""" TestCases for BoxPS Preload """
def test_boxps_cpu(self):
self.run_boxps_preload(True, True)
self.run_boxps_preload(True, False)
def test_boxps_gpu(self):
self.run_boxps_preload(False, True)
self.run_boxps_preload(False, False)
def run_boxps_preload(self, is_cpu=True, random_with_lineid=False):
program = fluid.Program()
with fluid.program_guard(program):
x = fluid.layers.data(
name='x', shape=[1], dtype='int64', lod_level=0)
y = fluid.layers.data(
name='y', shape=[1], dtype='int64', lod_level=0)
z = layers.data(name='z', shape=[1], dtype='int64')
emb_x, emb_y = _pull_box_sparse([x, y], size=2)
emb_xp = _pull_box_sparse(x, size=2)
concat = layers.concat([emb_x, emb_y], axis=1)
fc = layers.fc(input=concat,
name="fc",
size=1,
num_flatten_dims=1,
bias_attr=False)
loss = layers.reduce_mean(fc)
place = fluid.CPUPlace(
) if is_cpu or not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
batch_size = 100
def binary_print(slot, fout):
fout.write(str(len(slot)) + " ")
for e in slot:
fout.write(str(e) + " ")
batch1 = np.ones(
(batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
filelist = []
place_str = "cpu" if is_cpu else "gpu"
for i in range(2):
filelist.append("test_hdfs_" + place_str + "_" + str(i))
for f in filelist:
with open(f, "w") as fout:
for ins in batch1:
for slot in ins:
binary_print(slot, fout)
fout.write("\n")
def create_dataset():
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.set_date("20190930")
dataset.set_use_var([x, y])
dataset.set_batch_size(2)
dataset.set_thread(1)
dataset.set_filelist(filelist)
return dataset
datasets = []
datasets.append(create_dataset())
datasets.append(create_dataset())
optimizer = fluid.optimizer.SGD(learning_rate=0.5)
optimizer = fluid.optimizer.PipelineOptimizer(
optimizer,
cut_list=[],
place_list=[place],
concurrency_list=[1],
queue_size=1,
sync_steps=-1)
optimizer.minimize(loss)
program._pipeline_opt["dump_fields"] = [
"fc.tmp_0", "fc.tmp_0@GRAD", "fake_var", "z",
"reduce_mean_3.tmp_0"
]
# fake_var: not in scope
            # z: in scope, but not initialized
# reduce_mean_0.tmp_0, dimension is not right
program._pipeline_opt["dump_fields_path"] = "./dump_log/"
program._pipeline_opt["dump_param"] = ["fc.w_0"]
program._pipeline_opt["enable_random_dump"] = True
program._pipeline_opt["dump_interval"] = 10
program._pipeline_opt["random_with_lineid"] = random_with_lineid
exe.run(fluid.default_startup_program())
datasets[0].load_into_memory()
datasets[0].begin_pass()
datasets[0].slots_shuffle([])
datasets[1].preload_into_memory()
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=datasets[0],
print_period=1)
datasets[0].end_pass(True)
datasets[1].wait_preload_done()
datasets[1].begin_pass()
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=datasets[1],
print_period=1,
debug=True)
datasets[1].end_pass(False)
for f in filelist:
os.remove(f)
if os.path.isdir("dump_log"):
shutil.rmtree("dump_log")
if __name__ == '__main__':
    unittest.main()
...@@ -437,129 +437,6 @@ class TestDataNormOpWithSlotDim(OpTest):
        self.check_grad(['X'], 'Y', no_grad_set=set([]))
class TestDataNormOpWithSyncStats(unittest.TestCase):
"""
test class for data norm op
test forward and backward
"""
def test_sync_stats(self):
if not core.is_compiled_with_cuda():
return
if os.name == 'nt':
print(
'Skip TestDataNormOpWithSyncStats because nccl is not supported on windows'
)
return
x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
emb = layers.embedding(
input=x,
param_attr=fluid.ParamAttr(name="embx"),
size=[10, 2],
is_sparse=False)
dn = layers.data_norm(
input=emb,
name="hehe",
epsilon=1e-4,
param_attr={
"batch_size": 1e4,
"batch_sum": 1e5,
"batch_square": 1e4
},
summary_decay_rate=1,
sync_stats=True) #[-1,3]
loss = layers.mean(dn)
optimizer = fluid.optimizer.SGD(learning_rate=0.5)
optimizer = fluid.optimizer.PipelineOptimizer(
optimizer,
cut_list=[[emb], [loss]],
place_list=[
fluid.CUDAPlace(0), fluid.CUDAPlace(0), fluid.CPUPlace()
],
concurrency_list=[1, 1, 1],
queue_size=1,
sync_steps=10000000, )
all_p = fluid.default_main_program().global_block().all_parameters()
parameter_without_datanorm = []
for e in all_p:
if e.name.find("batch_size") != -1 or e.name.find(
"batch_sq") != -1 or e.name.find("batch_sum") != -1:
continue
parameter_without_datanorm.append(e.name)
optimizer.minimize(loss, parameter_list=parameter_without_datanorm)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
#prepare data
batch_size = 1
def binary_print(slot, fout):
num = np.int16(len(slot) + 1)
num.tofile(fout)
a = np.int64(batch_size)
a.tofile(fout)
slot.tofile(fout)
#batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
#batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
batch1 = np.ones(
(batch_size, 1)).astype("int64").reshape(batch_size, 1, 1)
batch2 = np.ones(
(batch_size, 1)).astype("int64").reshape(batch_size, 1, 1)
data = [batch1, batch2]
data = [batch1]
filelist = []
for i in range(2):
filelist.append("test_pipeline_input_" + str(i))
for f in filelist:
with open(f, "wb") as fout:
for batch_data in data:
for ins in batch_data:
for slot in ins:
binary_print(slot, fout)
dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
dataset.set_use_var([x])
dataset.set_batch_size(batch_size)
dataset.set_filelist(filelist)
block = fluid.default_startup_program().global_block()
block.append_op(
type='c_comm_init_all', attrs={'ring_id': 0,
'devices': [0, 1]})
with open("main_program", "w") as fout:
fout.write(str(fluid.default_main_program()))
with open("startup_program", "w") as fout:
fout.write(str(fluid.default_startup_program()))
exe.run(fluid.default_startup_program())
emb_t = fluid.global_scope().find_var("embx").get_tensor()
para = np.ones((10, 2)).astype("float32")
emb_t.set(para, place)
for epoch in range(1):
exe.train_from_dataset(
fluid.default_main_program(),
dataset,
thread=2,
debug=False,
fetch_list=[],
fetch_info=[],
print_period=1)
batch_size = np.array(fluid.global_scope().find_var("hehe.batch_size")
.get_tensor())
self.assertEqual(batch_size[0], 10002)
b = np.array(fluid.global_scope().find_var("hehe.batch_sum").get_tensor(
))
self.assertEqual(b[0], 100002)
c = np.array(fluid.global_scope().find_var("hehe.batch_square_sum")
.get_tensor())
self.assertEqual(c[0], 10162)
for f in filelist:
os.remove(f)
class TestDataNormOpErrorr(unittest.TestCase):
    def test_errors(self):
        with program_guard(Program(), Program()):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid as fluid
import paddle.fluid.core as core
import os
import unittest
import paddle.fluid.layers as layers
class TestDataFeed(unittest.TestCase):
""" TestBaseCase(Merge PV) """
def setUp(self):
self.batch_size = 10
self.pv_batch_size = 10
self.enable_pv_merge = True
self.merge_by_sid = True
def set_data_config(self):
self.dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
self.dataset.set_feed_type("PaddleBoxDataFeed")
self.dataset.set_parse_logkey(True)
self.dataset.set_thread(1)
self.dataset.set_enable_pv_merge(self.enable_pv_merge)
self.dataset.set_batch_size(self.batch_size)
if self.enable_pv_merge:
self.dataset.set_merge_by_sid(self.merge_by_sid)
self.dataset.set_rank_offset("rank_offset")
self.dataset.set_pv_batch_size(self.pv_batch_size)
def test_pboxdatafeed(self):
self.run_dataset(False)
def test_pboxdatafeed(self):
self.run_dataset(True)
def run_dataset(self, is_cpu):
x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
rank_offset = fluid.layers.data(
name="rank_offset",
shape=[-1, 7],
dtype="int32",
lod_level=0,
append_batch_size=False)
emb_x, emb_y = fluid.contrib.layers._pull_box_extended_sparse(
[x, y], size=2, extend_size=128)
concat = layers.concat([emb_x[0], emb_x[1], emb_y[0], emb_y[1]], axis=1)
fc = layers.fc(input=concat,
name="fc",
size=1,
num_flatten_dims=1,
bias_attr=False)
loss = layers.reduce_mean(fc)
place = fluid.CPUPlace() if is_cpu or not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
with open("test_run_with_dump_a.txt", "w") as f:
data = "1 1702f830eee19501ad7429505f714c1d 1 1 1 9\n"
data += "1 1702f830eee19502ad7429505f714c1d 1 2 1 8\n"
data += "1 1702f830eee19503ad7429505f714c1d 1 3 1 7\n"
data += "1 1702f830eee0de01ad7429505f714c2d 1 4 1 6\n"
data += "1 1702f830eee0df01ad7429505f714c3d 1 5 1 5\n"
data += "1 1702f830eee0df02ad7429505f714c3d 1 6 1 4\n"
f.write(data)
with open("test_run_with_dump_b.txt", "w") as f:
data = "1 1702f830fff22201ad7429505f715c1d 1 1 1 1\n"
data += "1 1702f830fff22202ad7429505f715c1d 1 2 1 2\n"
data += "1 1702f830fff22203ad7429505f715c1d 1 3 1 3\n"
data += "1 1702f830fff22101ad7429505f714ccd 1 4 1 4\n"
data += "1 1702f830fff22102ad7429505f714ccd 1 5 1 5\n"
data += "1 1702f830fff22103ad7429505f714ccd 1 6 1 6\n"
data += "1 1702f830fff22104ad7429505f714ccd 1 6 1 7\n"
f.write(data)
self.set_data_config()
self.dataset.set_use_var([x, y])
self.dataset.set_filelist(
["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
optimizer = fluid.optimizer.SGD(learning_rate=0.5)
optimizer = fluid.optimizer.PipelineOptimizer(
optimizer,
cut_list=[],
place_list=[place],
concurrency_list=[1],
queue_size=1,
sync_steps=-1)
optimizer.minimize(loss)
exe.run(fluid.default_startup_program())
self.dataset.set_current_phase(1)
self.dataset.load_into_memory()
self.dataset.preprocess_instance()
self.dataset.begin_pass()
pv_num = self.dataset.get_pv_data_size()
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=self.dataset,
print_period=1)
self.dataset.set_current_phase(0)
self.dataset.postprocess_instance()
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=self.dataset,
print_period=1)
self.dataset.end_pass(True)
os.remove("test_run_with_dump_a.txt")
os.remove("test_run_with_dump_b.txt")
class TestDataFeed2(TestDataFeed):
""" TestBaseCase(Merge PV not merge by sid) """
def setUp(self):
self.batch_size = 10
self.pv_batch_size = 10
self.enable_pv_merge = True
self.merge_by_sid = False
class TestDataFeed3(TestDataFeed):
""" TestBaseCase(Not Merge PV) """
def setUp(self):
self.batch_size = 10
self.pv_batch_size = 10
self.enable_pv_merge = False
if __name__ == '__main__':
unittest.main()
...@@ -19,137 +19,162 @@ import numpy as np
import os
import shutil
import unittest
import math
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
                  act=None):
    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=(filter_size - 1) // 2,
        groups=groups,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(
        input=conv,
        act=act, )


def shortcut(input, ch_out, stride, is_first):
    ch_in = input.shape[1]
    if ch_in != ch_out or stride != 1 or is_first == True:
        return conv_bn_layer(input, ch_out, 1, stride)
    else:
        return input


def bottleneck_block(input, num_filters, stride):
    conv0 = conv_bn_layer(
        input=input, num_filters=num_filters, filter_size=1, act='relu')
    conv1 = conv_bn_layer(
        input=conv0,
        num_filters=num_filters,
        filter_size=3,
        stride=stride,
        act='relu')
    conv2 = conv_bn_layer(
        input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)

    short = shortcut(input, num_filters * 4, stride, is_first=False)

    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(input, num_filters, stride, is_first):
conv0 = conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride)
conv1 = conv_bn_layer(
input=conv0, num_filters=num_filters, filter_size=3, act=None)
short = shortcut(input, num_filters, stride, is_first)
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def build_network(input, layers=50, class_dim=1000):
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers
depth = None
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
with fluid.device_guard("cpu"):
conv = conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
with fluid.device_guard("cpu"):
for i in range(depth[block]):
conv = bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
with fluid.device_guard("gpu:0"):
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
else:
for block in range(len(depth)):
with fluid.device_guard("cpu"):
for i in range(depth[block]):
conv = basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
is_first=block == i == 0)
with fluid.device_guard("gpu:0"):
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
return out
class TestPipeline(unittest.TestCase):
    """ TestCases for Pipeline Training. """

    def test_pipeline(self):
        with fluid.device_guard("cpu"):
            image = fluid.layers.data(
                name="image", shape=[3, 224, 224], dtype="float32")
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            data_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=False)
        fc = build_network(image, layers=50)
        with fluid.device_guard("gpu:0"):
            out, prob = fluid.layers.softmax_with_cross_entropy(
                logits=fc, label=label, return_softmax=True)
            loss = fluid.layers.mean(out)
            acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
            acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)

        base_lr = 0.1
        passes = [30, 60, 80, 90]
        total_images = 1281167
        steps_per_pass = total_images // 128
        bd = [steps_per_pass * p for p in passes]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
        optimizer = fluid.optimizer.Momentum(
            lr_val,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
        optimizer = fluid.optimizer.PipelineOptimizer(
            optimizer, num_microbatches=2)
        optimizer.minimize(loss)

    def test_pipeline_noneoptimizer(self):
        with fluid.device_guard("gpu:0"):
#prepare data
batch_size = 100
def binary_print(slot, fout):
num = np.int16(len(slot) + 1)
num.tofile(fout)
a = np.int64(batch_size)
a.tofile(fout)
slot.tofile(fout)
#batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
#batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
batch1 = np.ones(
(batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
batch2 = np.ones(
(batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
data = [batch1, batch2]
filelist = []
for i in range(2):
filelist.append("test_pipeline_input_" + str(i))
for f in filelist:
with open(f, "wb") as fout:
for batch_data in data:
for ins in batch_data:
for slot in ins:
binary_print(slot, fout)
dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
dataset.set_use_var([x, y])
dataset.set_batch_size(batch_size)
dataset.set_filelist(filelist)
for epoch in range(1):
exe.train_from_dataset(
fluid.default_main_program(),
dataset,
thread=1,
debug=False,
fetch_list=[],
fetch_info=[],
print_period=1)
for f in filelist:
os.remove(f)
def single_section(self, random_dump):
program = fluid.Program()
with fluid.program_guard(program):
            x = fluid.layers.data(
                name='x', shape=[1], dtype='int64', lod_level=0)
            y = fluid.layers.data(
...@@ -159,16 +184,8 @@ class TestPipeline(unittest.TestCase):
                param_attr=fluid.ParamAttr(name="embx"),
                size=[10, 2],
                is_sparse=False)
            emb_y = layers.embedding(
                input=y,
                param_attr=fluid.ParamAttr(
                    name="emby", learning_rate=0.9),
                size=[10, 2],
                is_sparse=False)
            concat = layers.concat([emb_x, emb_y], axis=1)

            fc = layers.fc(input=emb_x,
                           name="fc",
                           size=1,
                           num_flatten_dims=1,
...@@ -176,77 +193,9 @@ class TestPipeline(unittest.TestCase):
            loss = layers.reduce_mean(fc)
        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
        with self.assertRaises(ValueError):
            optimizer = fluid.optimizer.PipelineOptimizer(
                dict(), num_microbatches=2)
cut_list=[],
#place_list=[fluid.CPUPlace()],
place_list=[fluid.CUDAPlace(0)],
concurrency_list=[1],
queue_size=1,
sync_steps=-1)
optimizer.minimize(loss)
program._pipeline_opt["dump_fields"] = ["fc.tmp_0", "fc.tmp_0@GRAD"]
program._pipeline_opt["dump_fields_path"] = "./dump_log/"
program._pipeline_opt["dump_param"] = ["embx"]
program._pipeline_opt["enable_random_dump"] = random_dump
program._pipeline_opt["dump_interval"] = 10
program._pipeline_opt["random_with_lineid"] = False
#print(program._pipeline_opt)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
#prepare data
batch_size = 100
def binary_print(slot, fout):
num = np.int16(len(slot) + 1)
num.tofile(fout)
a = np.int64(batch_size)
a.tofile(fout)
slot.tofile(fout)
#batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
#batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
batch1 = np.ones(
(batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
batch2 = np.ones(
(batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
data = [batch1, batch2]
filelist = []
for i in range(2):
filelist.append("test_pipeline_input_" + str(i))
for f in filelist:
with open(f, "wb") as fout:
for batch_data in data:
for ins in batch_data:
for slot in ins:
binary_print(slot, fout)
dataset = fluid.DatasetFactory().create_dataset(
"FileInstantDataset")
dataset.set_use_var([x, y])
dataset.set_batch_size(batch_size)
dataset.set_filelist(filelist)
for epoch in range(1):
exe.train_from_dataset(
fluid.default_main_program(),
dataset,
thread=1,
debug=True,
fetch_list=[],
fetch_info=[],
print_period=1)
for f in filelist:
os.remove(f)
if os.path.isdir("dump_log"):
shutil.rmtree("dump_log")
def test_pipeline(self):
self.single_section(True)
self.single_section(False)
if __name__ == '__main__':
......