diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
index eb75018d7798fc326bceba9c00d4fb14aa521d48..f0904e18ea34658a2de85ff2265a0625e0b80238 100644
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@@ -1,5 +1,6 @@
 import functools
 import layers
+from framework import Variable
 from . import core
 
 __all__ = [
@@ -44,7 +45,7 @@ def error_clip_callback(block, context):
 
 
 class BaseGradientClipAttr(object):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         raise NotImplementedError()
 
     def create_operators(self, param, grad):
@@ -52,7 +53,7 @@ class BaseGradientClipAttr(object):
 
 
 class NullGradientClipAttr(BaseGradientClipAttr):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -69,7 +70,7 @@ class GradientClipByValue(BaseGradientClipAttr):
         self.max = max
         self.min = min
 
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -81,7 +82,7 @@ class GradientClipByNorm(BaseGradientClipAttr):
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
 
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -89,6 +90,46 @@ class GradientClipByNorm(BaseGradientClipAttr):
         return param, new_grad
 
 
+class GradientClipByGlobalNorm(BaseGradientClipAttr):
+    global_norm_var = None
+    clip_norm_var = None
+    ratio_var = None
+
+    @classmethod
+    def init(cls, clip_norm):
+        cls.global_norm_var = layers.fill_constant(
+            shape=[1], dtype="float32", value=0.0)
+        cls.clip_norm_var = layers.fill_constant(
+            shape=[1], dtype="float32", value=clip_norm)
+
+    def __init__(self):
+        if not (isinstance(self.__class__.global_norm_var, Variable) and
+                isinstance(self.__class__.clip_norm_var, Variable)):
+            raise ValueError(
+                "Class 'GradientClipByGlobalNorm' has not been properly initialized. Please call GradientClipByGlobalNorm.init() first."
+            )
+
+    def process_context(self, context, param, grad):
+        local_norm_var = layers.reduce_sum(
+            x=layers.pow(x=grad, factor=2), reduce_all=True)
+        layers.sums(
+            input=[local_norm_var, self.__class__.global_norm_var],
+            out=[self.__class__.global_norm_var])
+
+    def create_operators(self, param, grad):
+        if self.__class__.ratio_var is None:
+            self.__class__.global_norm_var = layers.sqrt(
+                x=self.__class__.global_norm_var)
+            self.__class__.ratio_var = layers.elementwise_div(
+                x=self.__class__.clip_norm_var,
+                y=layers.elementwise_max(
+                    x=self.__class__.clip_norm_var,
+                    y=self.__class__.global_norm_var))
+        # elementwise_max is still missing
+        # There is no way to feed ratio_var to scale_op yet.
+        # new_grad = layers.
+
+
 def append_gradient_clip_ops(param_grad):
     context = dict()
     create_op_callbacks = []
@@ -98,10 +139,9 @@ def append_gradient_clip_ops(param_grad):
             clip_attr = NullGradientClipAttr()
         if not isinstance(clip_attr, BaseGradientClipAttr):
             raise TypeError(
-                "clip attribute should be an instance of BaseGradientClippingAttr"
-            )
+                "clip attribute should be an instance of BaseGradientClipAttr")
 
-        clip_attr.process_context(context=context, p_g=param_grad)
+        clip_attr.process_context(context=context, param=p, grad=g)
         create_op_callbacks.append(
             functools.partial(
                 clip_attr.create_operators, param=p, grad=g))
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 884e84011d960cead150dffc51123bd98a7587b5..021b87828f3aeb511aad27e8789c50ca5ef44d56 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -1,23 +1,15 @@
 from ..registry import register_layer
 
 __activations__ = [
-    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
+    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round',
+    'pow'
 ]
 
 __all__ = [
-    'mean',
-    'mul',
-    'reshape',
-    'scale',
-    'transpose',
-    'sigmoid_cross_entropy_with_logits',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'clip',
-    'clip_by_norm',
-    'sequence_softmax',
+    'mean', 'mul', 'reshape', 'scale', 'transpose',
+    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
+    'elementwise_sub', 'elementwise_mul', 'clip', 'clip_by_norm',
+    'sequence_softmax', 'reduce_sum'
 ] + __activations__
 
 for _OP in set(__all__):
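Note (not part of the patch): the new `GradientClipByGlobalNorm` composes `pow`, `reduce_sum`, `sums`, `sqrt`, `elementwise_div` and the still-missing `elementwise_max` into the standard global-norm clipping rule. Below is a minimal NumPy sketch of that arithmetic; the helper name `clip_by_global_norm` is chosen only for illustration and is not an API in this patch.

```python
import numpy as np


def clip_by_global_norm(grads, clip_norm):
    """Illustrative only: the math GradientClipByGlobalNorm assembles as fluid ops."""
    # process_context accumulates sum(g ** 2) over all gradients;
    # create_operators then takes sqrt once to get the global norm.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    # ratio = clip_norm / max(global_norm, clip_norm): the ratio is 1.0
    # when the global norm is already within the limit.
    ratio = clip_norm / max(global_norm, clip_norm)
    # Every gradient is rescaled by the same ratio (the step the patch
    # still needs a scale-like op for).
    return [g * ratio for g in grads], global_norm


grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]  # global norm = 13
clipped, norm = clip_by_global_norm(grads, clip_norm=5.0)
```

This also shows why the patch asks for an `elementwise_max` layer: dividing by max(global_norm, clip_norm) rather than by the global norm alone leaves gradients untouched whenever clipping is not needed.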