提交 585dec3d 编写于 作者: X xuwei06

Calculating gradients for partial graph

Added backward.calc_gradient to backpropagate gradient from given targets to inputs.
上级 0ef9dc61
......@@ -87,7 +87,11 @@ class GradOpDescMakerBase {
auto onames = this->Output(name);
ret_val.reserve(onames.size());
std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
GradVarName);
[this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name);
(*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name;
});
return ret_val;
}
......
......@@ -129,7 +129,7 @@ class OpDesc {
}
proto::OpDesc desc_;
// input arg name => output variable names
// input arg name => input variable names
VariableNameMap inputs_;
// output arg name => output variable names
VariableNameMap outputs_;
......
from paddle.v2.fluid import framework as framework
from . import core
import collections
import copy
__all__ = ['append_backward']
__all__ = ['append_backward', 'calc_gradient']
def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
......@@ -65,6 +66,18 @@ def _all_in_set_(cands, s):
return True
def _some_in_set_(cands, s):
"""
Test if some elements of 'cands' are in set 's'
"""
if len(cands) == 0:
return False
for c in cands:
if c in s:
return True
return False
def _strip_grad_suffix_(name):
"""
Strip the grad suffix from the given varibale name
......@@ -169,8 +182,8 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
return op_descs
def _append_backward_ops_(target,
block,
def _append_backward_ops_(block,
ops,
target_block,
no_grad_dict,
grad_to_var,
......@@ -179,8 +192,8 @@ def _append_backward_ops_(target,
Create all grad ops, and insert them into given block
Args:
target(Variable): the target variable of forward pass
block(Block): the block where forward ops are
ops(Op): the forward operators whose backward ops need to be added
target_block(Block): the block which is going to hold new generated grad ops
no_grad_dict(dict):
key(int) block index
......@@ -202,14 +215,14 @@ def _append_backward_ops_(target,
# grad_op_descs holds created grad_op, and will be appended to target_block
grad_op_descs = []
program = block.program
for op in reversed(block.ops):
for op in reversed(ops):
grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block"))
grad_sub_block = program.create_block(parent_idx=sub_block.idx)
_append_backward_ops_(target, sub_block, grad_sub_block,
no_grad_dict, grad_to_var, callback)
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var)
grad_sub_block_list.append(grad_sub_block.desc)
# Getting op's corresponding grad_op
......@@ -224,14 +237,6 @@ def _append_backward_ops_(target,
grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
no_grad_dict[block.idx])
if target_block.idx == 0:
grad_op_descs.insert(
0,
_create_op_desc_("fill_constant", {}, {
"Out": [_append_grad_suffix_(target.name)]
}, {"shape": [1],
"value": 1.0,
"dtype": target.dtype}))
# append op_desc in grad_op_descs to target_block
for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op()
......@@ -252,7 +257,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
In most cases, this dict is generated by _append_backward_ops_()
grad_info_map(dict)(output argument):
key(str): forward variable name
val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
"""
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
......@@ -279,41 +284,63 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
_infer_var_data_type_(arg, block)
def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
var_map = copy.copy(target_grad_map)
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
for name in op_desc.input_arg_names():
if name in var_map:
op_desc.rename_input(name, var_map[name])
for name in op_desc.output_arg_names():
if block.desc.find_var(name.encode("ascii")):
new_name = "%s_%s" % (name, core.unique_integer(name))
op_desc.rename_output(name, new_name)
var_map[name] = new_name
for g, ng in var_map.iteritems():
if g in grad_to_var:
grad_to_var[ng] = grad_to_var[g]
grad_to_var.pop(g)
def _get_stop_gradients_(program):
no_grad_dict = dict()
assert isinstance(program, framework.Program)
for block in program.blocks:
assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues():
assert isinstance(var, framework.Variable)
if var.stop_gradient:
block_no_grad_set.add(_append_grad_suffix_(var.name))
no_grad_dict[block.idx] = block_no_grad_set
return no_grad_dict
def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
"""
Append backward part to main_program
Args:
loss(Variable): The variable generated by cost function.
parameter_list(list): Parameters that need to be updated by optimizer.
If None, it means all parameters need to be updated.
parameter_list(list[string]): Parameters that need to be updated by
optimizer. If None, it means all parameters need to be updated.
no_grad_set(set): Variables that have no gradients in Block 0.
If None, the set will be generated inside the function and
contains all variables with `step_gradient=True` from all blocks.
All variables with `step_gradient=True` from all blocks will be
automatically added.
Return:
(list[Variable]): list of (parameters, gradients) pair.
(list[(Variable,Variable)]): list of (parameter, gradient) pair.
"""
assert isinstance(loss, framework.Variable)
program = loss.block.program
no_grad_dict = dict()
if no_grad_set is None:
assert isinstance(program, framework.Program)
for block in program.blocks:
assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues():
assert isinstance(var, framework.Variable)
if var.stop_gradient:
block_no_grad_set.add(_append_grad_suffix_(var.name))
no_grad_dict[block.idx] = block_no_grad_set
elif isinstance(no_grad_set, set):
no_grad_dict = {
0: set([_append_grad_suffix_(name) for name in no_grad_set])
}
else:
raise ValueError("'no_grad_set' should be a set or None.")
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
no_grad_dict = _get_stop_gradients_(program)
no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
grad_info_map = dict()
root_block = program.block(0)
......@@ -322,8 +349,25 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
current_block_idx = program.current_block_idx
grad_to_var = dict()
_append_backward_ops_(loss, root_block, root_block, no_grad_dict,
op_desc = _create_op_desc_("fill_constant", {}, {
"Out": [_append_grad_suffix_(loss.name)]
}, {"shape": [1],
"value": 1.0,
"dtype": loss.dtype})
root_block.desc.append_op().copy_from(op_desc)
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
_append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
grad_to_var, callback)
# Because calc_gradient may be called multiple times,
# we need rename the internal gradient variables so that they have
# different names.
_rename_grad_(root_block, fwd_op_num, grad_to_var, {})
_append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
program.current_block_idx = current_block_idx
......@@ -334,6 +378,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
else:
params = program.global_block().all_parameters()
parameters = [param.name for param in params]
params_and_grads = []
for param in parameters:
if param not in grad_info_map:
......@@ -351,3 +396,147 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
else:
params_and_grads.append((param_var, None))
return params_and_grads
def _as_list(x):
if x is None:
return []
return list(x) if isinstance(x, collections.Sequence) else [x]
def _find_op_path_(block, outputs, inputs, no_grad_set):
"""
no_grad_set will also be changed
"""
input_names = set([inp.name for inp in inputs])
output_names = set([out.name for out in outputs])
relevant_op_flags = [True] * len(block.ops)
# All the inputs of the block are used if inputs is empty,
if inputs:
for i, op in enumerate(block.ops):
if _some_in_set_(op.desc.input_arg_names(), input_names):
for name in op.desc.output_arg_names():
if name not in no_grad_set:
input_names.add(name)
else:
relevant_op_flags[i] = False
for i, op in reversed(list(enumerate(block.ops))):
if _some_in_set_(op.desc.output_arg_names(), output_names):
for name in op.desc.input_arg_names():
if name not in no_grad_set:
output_names.add(name)
else:
relevant_op_flags[i] = False
op_path = [
block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
]
if inputs:
for op in op_path:
for name in op.desc.input_arg_names():
if name not in input_names:
no_grad_set.add(name)
return op_path
def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
"""
Backpropagate the graidents of targets to inputs.
Args:
targets(Variable|list[Variable]): The target variables
inputs(Variable|list[Variable]): The input variables
no_grad_set(set[string]): The names of variables that have no gradients
in Block 0. All variables with `stop_gradient=True` from all blocks
will be automatically added.
Return:
(list[Variable]): list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
will be None
"""
targets = _as_list(targets)
inputs = _as_list(inputs)
target_gradients = _as_list(target_gradients)
block = targets[0].block
prog = block.program
block_idx = block.idx
if not target_gradients:
target_gradients = [None] * len(targets)
if len(targets) != len(target_gradients):
raise ValueError(
"Should have the same number of target_gradients as targets")
if no_grad_set is None:
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
no_grad_dict = _get_stop_gradients_(prog)
no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
fwd_op_num = block.desc.op_size()
target_grad_map = {}
for i, grad in enumerate(target_gradients):
target = targets[i]
if grad is None:
grad_name = _append_grad_suffix_(target.name)
op_desc = _create_op_desc_("fill_constant_batch_size_like",
{"Input": [target.name]},
{"Out": [grad_name]}, {
"shape": target.shape,
"value": 1.0,
"dtype": target.dtype,
'input_dim_idx': 0,
'output_dim_idx': 0
})
block.desc.append_op().copy_from(op_desc)
else:
if target.block.idx != block_idx or target.block.program != prog:
raise ValueError("all targets must be in the same block")
if target.shape != grad.shape:
raise ValueError(
"The shapes of target and grad are different: %s %s" % (
target.name, grad.name))
target_grad_map[_append_grad_suffix_(target.name)] = grad.name
for input in inputs:
if input.block.program != prog:
raise "input must be in the same program as targets"
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
grad_to_var = dict()
grad_info_map = dict()
_append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
# Because calc_gradient may be called multiple times,
# we need rename the internal gradient variables so that they have
# different names.
_rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
_append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
prog.sync_with_cpp()
grad_vars = []
for input_var in inputs:
if input_var.name not in grad_info_map:
grad_vars.append(None)
else:
grad_info = grad_info_map[input_var.name]
grad_block = grad_info[1]
grad_var = grad_block.var(grad_info[0])
grad_vars.append(grad_var)
if len(grad_vars) == 1:
return grad_vars[0]
else:
return grad_vars
from ..registry import register_layer
__activations__ = [
'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
'abs',
'ceil',
'exp',
'floor',
'log',
'relu',
'round',
'sigmoid',
'sqrt',
'square',
'tanh',
]
__all__ = [
......
from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr
__all__ = [
'create_tensor', 'cast', 'concat', 'sums', 'assign',
'create_tensor', 'create_parameter', 'cast', 'concat', 'sums', 'assign',
'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros'
]
......@@ -11,6 +12,33 @@ def create_tensor(dtype, name=None):
return helper.create_variable(name=helper.name, dtype=dtype)
def create_parameter(shape,
dtype,
attr=None,
is_bias=False,
default_initializer=None):
"""
Create a parameter
Args:
shape(list[int]): shape of the parameter
dtype(string): element type of the parameter
attr(ParamAttr): attributes of the parameter
is_bias(bool): This can affect which default initializer is chosen
when default_initializer is None. If is_bias,
initializer.Constant(0.0) will be used. Otherwise,
Xavier() will be used.
default_initializer(Initializer): initializer for the parameter
Returns:
Parameter: the created parameter
"""
helper = LayerHelper("create_parameter")
if attr is None:
attr = ParamAttr()
return helper.create_parameter(attr, shape, dtype, is_bias,
default_initializer)
def cast(x, dtype):
"""
This function takes in the input with input_dtype
......@@ -180,7 +208,8 @@ def fill_constant_batch_size_like(input,
Examples:
.. code-block:: python
data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
data = fluid.layers.fill_constant_batch_size_like(
input=like, shape=[1], value=0, dtype='int64')
"""
helper = LayerHelper("fill_constant_batch_size_like", **locals())
out = helper.create_tmp_variable(dtype=dtype)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册