Unverified commit df927768, authored by fengjiayi, committed by GitHub

Merge pull request #7269 from emailweixu/calc_gradient

Calculating gradients for partial graph
@@ -87,7 +87,11 @@ class GradOpDescMakerBase {
     auto onames = this->Output(name);
     ret_val.reserve(onames.size());
     std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
-                   GradVarName);
+                   [this](const std::string& fwd_var_name) -> std::string {
+                     auto g_name = GradVarName(fwd_var_name);
+                     (*this->grad_to_var_)[g_name] = fwd_var_name;
+                     return g_name;
+                   });
     return ret_val;
   }
...
@@ -129,7 +129,7 @@ class OpDesc {
   }

   proto::OpDesc desc_;
-  // input arg name => output variable names
+  // input arg name => input variable names
   VariableNameMap inputs_;
   // output arg name => output variable names
   VariableNameMap outputs_;
...
@@ -39,7 +39,7 @@ class NormOpMaker : public framework::OpProtoAndCheckerMaker {
              "M = C * H * W");
     AddComment(R"DOC(
 "Input shape: $(N, C, H, W)$
-Sclae shape: $(C, 1)$
+Scale shape: $(C, 1)$
 Output shape: $(N, C, H, W)$
 Where
   forward
...
@@ -66,7 +66,7 @@ class NormKernel : public framework::OpKernel<T> {
                      context.GetPlace());
     auto tmp = framework::EigenVector<T, Eigen::RowMajor,
                                       Eigen::DenseIndex>::Flatten(tmp_tensor);
     // get colsum and sqrt , inverse
     auto dim = Eigen::array<int, 1>({{0}});
     tmp.device(*place) = x_square_batch_eigen.sum(dim);
     tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
...
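The three Eigen statements above reduce over dimension 0 and then take an inverse square root with an epsilon for numerical stability. A minimal NumPy sketch of just that step (the array name and shape here are hypothetical, not taken from the kernel):

```python
import numpy as np

# Hypothetical stand-in for x_square_batch_eigen: a 2-D array whose
# first axis is reduced, mirroring Eigen::array<int, 1>({{0}}).
x_square = np.random.rand(3, 16).astype("float32") ** 2
epsilon = 1e-10

tmp = x_square.sum(axis=0)          # tmp = x_square_batch_eigen.sum(dim)
tmp = 1.0 / np.sqrt(tmp + epsilon)  # tmp = (tmp + epsilon).sqrt().inverse()
print(tmp.shape)                    # (16,)
```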
 from paddle.v2.fluid import framework as framework
 from . import core
 import collections
+import copy

-__all__ = ['append_backward']
+__all__ = ['append_backward', 'calc_gradient']


 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
@@ -65,6 +66,18 @@ def _all_in_set_(cands, s):
     return True


+def _some_in_set_(cands, s):
+    """
+    Test if some elements of 'cands' are in set 's'
+    """
+    if len(cands) == 0:
+        return False
+    for c in cands:
+        if c in s:
+            return True
+    return False
+
+
 def _strip_grad_suffix_(name):
     """
     Strip the grad suffix from the given variable name
@@ -169,8 +182,8 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
     return op_descs


-def _append_backward_ops_(target,
-                          block,
+def _append_backward_ops_(block,
+                          ops,
                           target_block,
                           no_grad_dict,
                           grad_to_var,
@@ -179,8 +192,8 @@ def _append_backward_ops_(target,
     Create all grad ops, and insert them into given block

     Args:
-        target(Variable): the target variable of forward pass
         block(Block): the block where forward ops are
+        ops(Op): the forward operators whose backward ops need to be added
         target_block(Block): the block which is going to hold new generated grad ops
         no_grad_dict(dict):
             key(int) block index
@@ -202,14 +215,14 @@ def _append_backward_ops_(target,
     # grad_op_descs holds created grad_op, and will be appended to target_block
     grad_op_descs = []
     program = block.program
-    for op in reversed(block.ops):
+    for op in reversed(ops):
         grad_sub_block_list = []
         # If the op has its own sub-block, deal with the sub-block first
         if op.has_attr("sub_block"):
             sub_block = program.block(op.block_attr("sub_block"))
             grad_sub_block = program.create_block(parent_idx=sub_block.idx)
-            _append_backward_ops_(target, sub_block, grad_sub_block,
-                                  no_grad_dict, grad_to_var, callback)
+            _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
+                                  no_grad_dict, grad_to_var)
             grad_sub_block_list.append(grad_sub_block.desc)

         # Getting op's corresponding grad_op
@@ -224,14 +237,6 @@ def _append_backward_ops_(target,
     grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
                                             no_grad_dict[block.idx])

-    if target_block.idx == 0:
-        grad_op_descs.insert(
-            0,
-            _create_op_desc_("fill_constant", {}, {
-                "Out": [_append_grad_suffix_(target.name)]
-            }, {"shape": [1],
-                "value": 1.0,
-                "dtype": target.dtype}))
     # append op_desc in grad_op_descs to target_block
     for op_desc in grad_op_descs:
         new_op_desc = target_block.desc.append_op()
@@ -252,7 +257,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
             In most cases, this dict is generated by _append_backward_ops_()
         grad_info_map(dict)(output argument):
             key(str): forward variable name
-            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
     """
     for op_idx in range(start_op_idx, block.desc.op_size()):
         op_desc = block.desc.op(op_idx)
@@ -279,41 +284,63 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
             _infer_var_data_type_(arg, block)


+def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
+    var_map = copy.copy(target_grad_map)
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        for name in op_desc.input_arg_names():
+            if name in var_map:
+                op_desc.rename_input(name, var_map[name])
+
+        for name in op_desc.output_arg_names():
+            if block.desc.find_var(name.encode("ascii")):
+                new_name = "%s_%s" % (name, core.unique_integer(name))
+                op_desc.rename_output(name, new_name)
+                var_map[name] = new_name
+
+    for g, ng in var_map.iteritems():
+        if g in grad_to_var:
+            grad_to_var[ng] = grad_to_var[g]
+            grad_to_var.pop(g)
+
+
+def _get_stop_gradients_(program):
+    no_grad_dict = dict()
+    assert isinstance(program, framework.Program)
+    for block in program.blocks:
+        assert isinstance(block, framework.Block)
+        block_no_grad_set = set()
+        for var in block.vars.itervalues():
+            assert isinstance(var, framework.Variable)
+            if var.stop_gradient:
+                block_no_grad_set.add(_append_grad_suffix_(var.name))
+        no_grad_dict[block.idx] = block_no_grad_set
+    return no_grad_dict
+
+
 def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     """
     Append backward part to main_program

     Args:
         loss(Variable): The variable generated by cost function.
-        parameter_list(list): Parameters that need to be updated by optimizer.
-            If None, it means all parameters need to be updated.
+        parameter_list(list[string]): Parameters that need to be updated by
+            optimizer. If None, it means all parameters need to be updated.
         no_grad_set(set): Variables that have no gradients in Block 0.
-            If None, the set will be generated inside the function and
-            contains all variables with `step_gradient=True` from all blocks.
+            All variables with `stop_gradient=True` from all blocks will be
+            automatically added.

     Return:
-        (list[Variable]): list of (parameters, gradients) pair.
+        (list[(Variable,Variable)]): list of (parameter, gradient) pairs.
     """
     assert isinstance(loss, framework.Variable)

     program = loss.block.program
-    no_grad_dict = dict()
     if no_grad_set is None:
-        assert isinstance(program, framework.Program)
-        for block in program.blocks:
-            assert isinstance(block, framework.Block)
-            block_no_grad_set = set()
-            for var in block.vars.itervalues():
-                assert isinstance(var, framework.Variable)
-                if var.stop_gradient:
-                    block_no_grad_set.add(_append_grad_suffix_(var.name))
-            no_grad_dict[block.idx] = block_no_grad_set
-    elif isinstance(no_grad_set, set):
-        no_grad_dict = {
-            0: set([_append_grad_suffix_(name) for name in no_grad_set])
-        }
-    else:
-        raise ValueError("'no_grad_set' should be a set or None.")
+        no_grad_set = set()
+    no_grad_set = copy.copy(no_grad_set)
+    no_grad_dict = _get_stop_gradients_(program)
+    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))

     grad_info_map = dict()
     root_block = program.block(0)
@@ -322,8 +349,25 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     current_block_idx = program.current_block_idx
     grad_to_var = dict()

-    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
+    op_desc = _create_op_desc_("fill_constant", {}, {
+        "Out": [_append_grad_suffix_(loss.name)]
+    }, {"shape": [1],
+        "value": 1.0,
+        "dtype": loss.dtype})
+    root_block.desc.append_op().copy_from(op_desc)
+
+    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
+    op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
+    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+
+    _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
                           grad_to_var, callback)
+
+    # Because calc_gradient may be called multiple times,
+    # we need to rename the internal gradient variables so that they have
+    # different names.
+    _rename_grad_(root_block, fwd_op_num, grad_to_var, {})
+
     _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)

     program.current_block_idx = current_block_idx
@@ -334,6 +378,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     else:
         params = program.global_block().all_parameters()
         parameters = [param.name for param in params]
+
     params_and_grads = []
     for param in parameters:
         if param not in grad_info_map:
@@ -351,3 +396,147 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
         else:
             params_and_grads.append((param_var, None))
     return params_and_grads
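For context, a minimal usage sketch of `append_backward` after this change; it reuses the layers exercised by the new unit test at the end of this diff and assumes the default startup/main programs:

```python
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward

# A tiny forward graph: loss = mean(x * y).
x = layers.create_parameter(dtype="float32", shape=[5, 10])
y = layers.create_parameter(dtype="float32", shape=[10, 8])
loss = layers.mean(x=layers.mul(x=x, y=y))

# append_backward returns (parameter, gradient) pairs; the gradient is
# None for parameters the loss does not depend on.
for param, grad in append_backward(loss):
    print(param.name, None if grad is None else grad.name)
```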
+
+
+def _as_list(x):
+    if x is None:
+        return []
+    return list(x) if isinstance(x, collections.Sequence) else [x]
+
+
+def _find_op_path_(block, outputs, inputs, no_grad_set):
+    """
+    no_grad_set will also be changed
+    """
+    input_names = set([inp.name for inp in inputs])
+    output_names = set([out.name for out in outputs])
+
+    relevant_op_flags = [True] * len(block.ops)
+
+    # All the inputs of the block are used if inputs is empty,
+    if inputs:
+        for i, op in enumerate(block.ops):
+            if _some_in_set_(op.desc.input_arg_names(), input_names):
+                for name in op.desc.output_arg_names():
+                    if name not in no_grad_set:
+                        input_names.add(name)
+            else:
+                relevant_op_flags[i] = False
+
+    for i, op in reversed(list(enumerate(block.ops))):
+        if _some_in_set_(op.desc.output_arg_names(), output_names):
+            for name in op.desc.input_arg_names():
+                if name not in no_grad_set:
+                    output_names.add(name)
+        else:
+            relevant_op_flags[i] = False
+
+    op_path = [
+        block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
+    ]
+
+    if inputs:
+        for op in op_path:
+            for name in op.desc.input_arg_names():
+                if name not in input_names:
+                    no_grad_set.add(name)
+
+    return op_path
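`_find_op_path_` prunes the block in two sweeps: a forward sweep keeps only ops reachable from `inputs` (when `inputs` is given), and a reverse sweep keeps only ops that contribute to `outputs`. A standalone toy sketch of the same idea, using hypothetical `Op` records instead of fluid op descs:

```python
from collections import namedtuple

# Hypothetical ops: each records its input and output variable names.
Op = namedtuple("Op", ["inputs", "outputs"])
ops = [
    Op(["x"], ["a"]),          # reachable from x and feeding the output
    Op(["noise"], ["b"]),      # not reachable from x -> pruned
    Op(["a", "b"], ["loss"]),  # feeds the output
]


def find_op_path(ops, outputs, inputs):
    relevant = [True] * len(ops)
    reached = set(inputs)
    for i, op in enumerate(ops):         # forward sweep from inputs
        if any(name in reached for name in op.inputs):
            reached.update(op.outputs)
        else:
            relevant[i] = False
    needed = set(outputs)
    for i in reversed(range(len(ops))):  # reverse sweep from outputs
        if any(name in needed for name in ops[i].outputs):
            needed.update(ops[i].inputs)
        else:
            relevant[i] = False
    return [op for i, op in enumerate(ops) if relevant[i]]


print(find_op_path(ops, outputs=["loss"], inputs=["x"]))  # keeps ops 0 and 2
```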
+
+
+def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
+    """
+    Backpropagate the gradients of targets to inputs.
+
+    Args:
+        targets(Variable|list[Variable]): The target variables
+        inputs(Variable|list[Variable]): The input variables
+        target_gradients(Variable|list[Variable], optional): The gradients of
+            the targets. If None, gradients filled with ones are used.
+        no_grad_set(set[string]): The names of variables that have no gradients
+            in Block 0. All variables with `stop_gradient=True` from all blocks
+            will be automatically added.
+
+    Return:
+        (list[Variable]): list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient variable
+        will be None
+    """
+    targets = _as_list(targets)
+    inputs = _as_list(inputs)
+    target_gradients = _as_list(target_gradients)
+
+    block = targets[0].block
+    prog = block.program
+    block_idx = block.idx
+
+    if not target_gradients:
+        target_gradients = [None] * len(targets)
+
+    if len(targets) != len(target_gradients):
+        raise ValueError(
+            "Should have the same number of target_gradients as targets")
+
+    if no_grad_set is None:
+        no_grad_set = set()
+    no_grad_set = copy.copy(no_grad_set)
+    no_grad_dict = _get_stop_gradients_(prog)
+    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+
+    fwd_op_num = block.desc.op_size()
+
+    target_grad_map = {}
+    for i, grad in enumerate(target_gradients):
+        target = targets[i]
+        if grad is None:
+            grad_name = _append_grad_suffix_(target.name)
+            op_desc = _create_op_desc_("fill_constant_batch_size_like",
+                                       {"Input": [target.name]},
+                                       {"Out": [grad_name]}, {
+                                           "shape": target.shape,
+                                           "value": 1.0,
+                                           "dtype": target.dtype,
+                                           'input_dim_idx': 0,
+                                           'output_dim_idx': 0
+                                       })
+            block.desc.append_op().copy_from(op_desc)
+        else:
+            if target.block.idx != block_idx or target.block.program != prog:
+                raise ValueError("all targets must be in the same block")
+            if target.shape != grad.shape:
+                raise ValueError(
+                    "The shapes of target and grad are different: %s %s" % (
+                        target.name, grad.name))
+            target_grad_map[_append_grad_suffix_(target.name)] = grad.name
+
+    for input in inputs:
+        if input.block.program != prog:
+            raise ValueError("input must be in the same program as targets")
+
+    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
+    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
+    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    grad_to_var = dict()
+    grad_info_map = dict()
+    _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
+
+    # Because calc_gradient may be called multiple times,
+    # we need to rename the internal gradient variables so that they have
+    # different names.
+    _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
+
+    _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
+    prog.sync_with_cpp()
+
+    grad_vars = []
+    for input_var in inputs:
+        if input_var.name not in grad_info_map:
+            grad_vars.append(None)
+        else:
+            grad_info = grad_info_map[input_var.name]
+            grad_block = grad_info[1]
+            grad_var = grad_block.var(grad_info[0])
+            grad_vars.append(grad_var)
+
+    if len(grad_vars) == 1:
+        return grad_vars[0]
+    else:
+        return grad_vars
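Besides the default all-ones gradient created via `fill_constant_batch_size_like`, `calc_gradient` accepts explicit `target_gradients`; the supplied gradient must live in the same block and match the target's shape, or a ValueError is raised. A usage sketch under those assumptions (shapes chosen here for illustration only):

```python
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import calc_gradient

x = layers.create_parameter(dtype="float32", shape=[5, 10])
y = layers.create_parameter(dtype="float32", shape=[10, 8])
out = layers.mul(x=x, y=y)  # shape [5, 8]

# Supply an explicit gradient for `out` instead of the default ones.
out_grad = layers.fill_constant(shape=[5, 8], dtype="float32", value=2.0)
x_grad = calc_gradient(out, x, target_gradients=out_grad)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(fluid.default_main_program(), fetch_list=[x_grad])
```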
 from ..registry import register_layer

 __activations__ = [
-    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
+    'abs',
+    'ceil',
+    'exp',
+    'floor',
+    'log',
+    'relu',
+    'round',
+    'sigmoid',
+    'sqrt',
+    'square',
+    'tanh',
 ]

 __all__ = [
...
 from ..layer_helper import LayerHelper
+from ..param_attr import ParamAttr

 __all__ = [
-    'create_tensor', 'cast', 'concat', 'sums', 'assign',
+    'create_tensor', 'create_parameter', 'cast', 'concat', 'sums', 'assign',
     'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros'
 ]

@@ -11,6 +12,33 @@ def create_tensor(dtype, name=None):
     return helper.create_variable(name=helper.name, dtype=dtype)


+def create_parameter(shape,
+                     dtype,
+                     attr=None,
+                     is_bias=False,
+                     default_initializer=None):
+    """
+    Create a parameter
+    Args:
+        shape(list[int]): shape of the parameter
+        dtype(string): element type of the parameter
+        attr(ParamAttr): attributes of the parameter
+        is_bias(bool): This can affect which default initializer is chosen
+                       when default_initializer is None. If is_bias,
+                       initializer.Constant(0.0) will be used. Otherwise,
+                       Xavier() will be used.
+        default_initializer(Initializer): initializer for the parameter
+
+    Returns:
+        Parameter: the created parameter
+    """
+    helper = LayerHelper("create_parameter")
+    if attr is None:
+        attr = ParamAttr()
+    return helper.create_parameter(attr, shape, dtype, is_bias,
+                                   default_initializer)
+
+
 def cast(x, dtype):
     """
     This function takes in the input with input_dtype
@@ -180,7 +208,8 @@ def fill_constant_batch_size_like(input,
     Examples:
         .. code-block:: python

-            data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
+            data = fluid.layers.fill_constant_batch_size_like(
+                input=like, shape=[1], value=0, dtype='int64')
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_tmp_variable(dtype=dtype)
...
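A short usage sketch for the new `create_parameter` layer; per the docstring above, a plain weight gets the Xavier initializer by default while `is_bias=True` switches the default to Constant(0.0). The shapes below are hypothetical:

```python
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers

w = layers.create_parameter(shape=[784, 200], dtype="float32")
b = layers.create_parameter(shape=[200], dtype="float32", is_bias=True)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())  # runs the parameter initializers
```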
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import calc_gradient
+
+
+class TestCalcGradient(unittest.TestCase):
+    def test_calc_gradient(self):
+        x = layers.create_parameter(dtype="float32", shape=[5, 10])
+        y = layers.create_parameter(dtype="float32", shape=[10, 8])
+        mul_out = layers.mul(x=x, y=y)
+        mean_out = layers.mean(x=mul_out)
+        a = calc_gradient(mean_out, mul_out)
+        b = calc_gradient(mean_out, x)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+
+
+if __name__ == "__main__":
+    unittest.main()