diff --git a/doc/design/error_clip.md b/doc/design/error_clip.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e845462cce2a29556bcb6010b08f00fbc3d99d7
--- /dev/null
+++ b/doc/design/error_clip.md
@@ -0,0 +1,87 @@
+# Error Clip
+
+## Overview
+
+Error clip is widely used in model training to prevent exploding gradients. It applies specific rules to adjust variables' gradients and keep them from growing too large. With error clip, the values of a gradient are checked before they are consumed by the next `grad_op`, and shrunk if necessary.
+
+## Usage
+
+Users are allowed to assign different error clip methods or attributes to different `Variable`s. The attribute is specified as a parameter of `Variable`'s constructor:
+
+```python
+var = framework.Variable(..., error_clip=myErrorClip, ...)
+```
+
+The default value of `error_clip` is `None`, which means no error clip is employed. When it is not `None`, it should be an object of a class derived from `BaseErrorClipAttr`. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
+
+```python
+ErrorClipByValue(max, min=None)
+```
+
+`max` and `min` represent the maximum and minimum clip thresholds, respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` are clipped to `max` and `min`, respectively. When `min` is `None`, the minimum threshold is automatically set to `-max`.
+
+So we can enable error clip with the threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
+
+## Implementation
+
+`BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*:
+
+```python
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        block.append_op(
+            type="clip",
+            inputs={"X": grad_name},
+            outputs={"Out": grad_name},
+            attrs={"min": self.min,
+                   "max": self.max})
+```
+
+`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function creates a `clip_op` and appends it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is left unimplemented in the base class; all derived classes must implement their own versions of it.
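+
+For illustration, supporting another clipping rule would only require one more derived class. The following hypothetical `ErrorClipByNorm` is a sketch and not part of this design; it assumes an operator of type `clip_by_norm` (input `X`, output `Out`, float attribute `max_norm`) is available:
+
+```python
+class ErrorClipByNorm(BaseErrorClipAttr):
+    # Hypothetical sketch only: clip the gradient by its L2 norm, assuming a
+    # `clip_by_norm` operator with a float `max_norm` attribute exists.
+    def __init__(self, max_norm):
+        self.max_norm = float(max_norm)
+
+    def append_clip_op(self, block, grad_name):
+        block.append_op(
+            type="clip_by_norm",
+            inputs={"X": grad_name},
+            outputs={"Out": grad_name},
+            attrs={"max_norm": self.max_norm})
+```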
+
+These `clip_op`s should be inserted right after the `grad_op`s whose output gradients need to be clipped. That is equivalent to appending a `clip_op` to the end of the target block each time a new `grad_op` is added:
+
+```python
+for op_desc in grad_op_descs:
+    new_op_desc = target_block.desc.append_op()
+    new_op_desc.copy_from(op_desc)
+    callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to do this job. In the `_append_backward_ops_` function, each time after a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` calls that attribute's `append_clip_op` function to append the required `clip_op` to the `block`.
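+
+Finally, users do not need to invoke this callback themselves. As the change to *optimizer.py* in this patch shows, `Optimizer.minimize()` passes `error_clip_callback` to `append_backward()` when building the backward pass, so setting a `Variable`'s `error_clip` attribute is all that is required:
+
+```python
+params_grads = append_backward(loss, parameter_list, no_grad_set,
+                               error_clip_callback)
+```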
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index 66a7f737574c438a2b945bd4a49d8317bd460c80..8d0eb53b8e1ff6b0a05218c6d0c4e017a21b3fbb 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -188,7 +188,17 @@ def _append_backward_ops_(target,
         grad_to_var(dict)(output argument): key(str): grad variable name
                                             val(str): corresponding forward variable name
+        callback(callable object): a callable object used to decorate newly generated grad ops
     """
+    if callback is None:
+
+        def empty_callback(block, context):
+            pass
+
+        callback = empty_callback
+    elif not hasattr(callback, '__call__'):
+        raise ValueError("'callback' must be a callable object.")
+
     # grad_op_descs holds created grad_op, and will be appended to target_block
     grad_op_descs = []
     program = block.program
@@ -226,6 +236,7 @@ def _append_backward_ops_(target,
     for op_desc in grad_op_descs:
         new_op_desc = target_block.desc.append_op()
         new_op_desc.copy_from(op_desc)
+        callback(block=target_block, context=grad_to_var)


 def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
@@ -268,7 +279,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
             _infer_var_data_type_(arg, block)


-def append_backward(loss, parameter_list=None, no_grad_set=None):
+def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     """
     Append backward part to main_program

@@ -312,7 +323,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
     grad_to_var = dict()

     _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
-                          grad_to_var)
+                          grad_to_var, callback)
     _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)

     program.current_block_idx = current_block_idx
diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
index d7ec2fbe13fe6d9158345099b8668afc5c7d4571..b1fd1c2b65f10010fa959dbb47b3fbab114db2f2 100644
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@@ -1,7 +1,46 @@
 import functools
 import layers
+from . import core

-__all__ = ['GradientClipByValue', 'append_gradient_clip_ops']
+__all__ = [
+    'GradientClipByValue', 'append_gradient_clip_ops', 'error_clip_callback'
+]
+
+
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        block.append_op(
+            type="clip",
+            inputs={"X": grad_name},
+            outputs={"Out": grad_name},
+            attrs={"min": self.min,
+                   "max": self.max})
+
+
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)


 class BaseGradientClipAttr(object):
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 2dfb8b624160499492b7247f0e1dd1fba793817e..85c1e6eb7ba37f71d79f1dd6c34539d0b1dcbf11 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -143,9 +143,11 @@ class Variable(object):
                  dtype=None,
                  lod_level=None,
                  persistable=None,
+                 error_clip=None,
                  stop_gradient=False,
                  **kwargs):
         self.block = block
+        self.error_clip = error_clip

         if name is None:
             name = Variable._unique_var_name_()
@@ -622,6 +624,17 @@ class Block(object):
             raise ValueError("var %s not in this block" % name)
         return v

+    def var_recursive(self, name):
+        if self.has_var(name):
+            return self.var(name)
+        else:
+            if self.idx == 0:
+                raise ValueError("var %s is not in block(%d) nor its parents." %
+                                 (name, self.idx))
+            else:
+                parent_block = self.program.block(self.parent_idx)
+                return parent_block.var_recursive(name)
+
     def all_parameters(self):
         return list(self.iter_parameters())

@@ -740,6 +753,7 @@ class Block(object):
                 optimize_attr=p.optimize_attr,
                 regularizer=p.regularizer,
                 clip_attr=p.clip_attr,
+                error_clip=p.error_clip,
                 name=v.name)
             self.vars[new_p.name] = new_p

diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index ff3e5315a2c2b115e4ba563f60de4139f248e93a..40721b5e97a3ab2b6fe772635454105f5cdf7b6c 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -6,7 +6,7 @@ from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
-from clip import append_gradient_clip_ops
+from clip import append_gradient_clip_ops, error_clip_callback

 __all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']

@@ -197,7 +197,8 @@ class Optimizer(object):
         This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward(loss, parameter_list, no_grad_set)
+        params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                       error_clip_callback)
         params_grads = append_gradient_clip_ops(params_grads)