diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index d6774faf686bc618977a1a618385b139cfd82f22..439d60ad674b2587072a86bcb2252a435b997275 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -21,6 +21,46 @@ from . import core
 
 __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']
 
 
+def _create_regularization_of_grad(param, grad, regularization=None):
+    """ Create and add backward regularization Operators
+
+    Function helper of append_regularization_ops.
+    """
+    # If no gradient or no regularization is specified, then we don't need to do anything
+    if grad is None or (param.regularizer is None and regularization is None):
+        return grad
+    regularization_term = None
+    if param.regularizer is not None:
+        # Add variable for regularization term in grad block
+        regularization_term = param.regularizer(param, grad, grad.block)
+    elif regularization is not None:
+        regularization_term = regularization(param, grad, grad.block)
+
+    assert regularization_term is not None
+
+    new_grad = grad
+    if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+        # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
+        # the grad's type and name will be changed. But the gradient's name
+        # is used in ParallelExecutor Reduce mode, so I add a flag for
+        # the new_grad here.
+        new_grad = grad.block.create_var(
+            name=grad.name + core.kNewGradSuffix(),
+            dtype=param.dtype,
+            shape=param.shape,
+            lod_level=param.lod_level,
+            type=core.VarDesc.VarType.LOD_TENSOR)
+
+    inputs = {"X": [grad, regularization_term]}
+    outputs = {"Out": [new_grad]}
+    if in_dygraph_mode():
+        core.ops.sum(inputs, {}, outputs)
+    else:
+        grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
+
+    return new_grad
+
+
 def append_regularization_ops(parameters_and_grads, regularization=None):
     """Create and add backward regularization Operators
 
@@ -43,47 +83,18 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
         Exception: Unknown regularization type
     """
     params_and_grads = []
-    for param, grad in parameters_and_grads:
-        # If no gradient then we don't need to do anything
-        if grad is None:
-            params_and_grads.append((param, grad))
-            continue
-        with param.block.program._optimized_guard(
-            [param, grad]), framework.name_scope('regularization'):
-            regularization_term = None
-            if param.regularizer is not None:
-                # Add variable for regularization term in grad block
-                regularization_term = param.regularizer(param, grad, grad.block)
-            elif regularization is not None:
-                regularization_term = regularization(param, grad, grad.block)
-
-            # If no regularization specified, then we don't need to do anything
-            if regularization_term is None:
-                params_and_grads.append((param, grad))
-                continue
-
-            new_grad = grad
-            if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-                # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
-                # the grad's type and name will be changed. But the gradient's name
-                # is used in ParallelExecutor Reduce mode, so I add a flag for
-                # the new_grad here.
-                new_grad = grad.block.create_var(
-                    name=grad.name + core.kNewGradSuffix(),
-                    dtype=param.dtype,
-                    shape=param.shape,
-                    lod_level=param.lod_level,
-                    type=core.VarDesc.VarType.LOD_TENSOR)
-
-            inputs = {"X": [grad, regularization_term]}
-            outputs = {"Out": [new_grad]}
-            if in_dygraph_mode():
-                core.ops.sum(inputs, {}, outputs)
-            else:
-                grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
-
+    if in_dygraph_mode():
+        for param, grad in parameters_and_grads:
+            new_grad = _create_regularization_of_grad(param, grad,
+                                                      regularization)
             params_and_grads.append((param, new_grad))
-
+    else:
+        with framework.name_scope('regularization'):
+            for param, grad in parameters_and_grads:
+                with param.block.program._optimized_guard([param, grad]):
+                    new_grad = _create_regularization_of_grad(param, grad,
+                                                              regularization)
+                    params_and_grads.append((param, new_grad))
     return params_and_grads
 
 
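As a rough usage sketch (not part of this change): the refactored path is normally reached through an optimizer rather than called directly. Optimizer.apply_gradients forwards its regularization argument to append_regularization_ops, and a per-parameter regularizer set through ParamAttr takes precedence via the param.regularizer branch of _create_regularization_of_grad. The toy network below is illustrative only.

# Illustrative sketch only: exercising append_regularization_ops
# indirectly through a fluid optimizer (static-graph mode).
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 13], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='float32')

    # Per-parameter regularizer: picked up by the param.regularizer branch
    # of _create_regularization_of_grad and overrides the global setting.
    pred = fluid.layers.fc(
        input=x,
        size=1,
        param_attr=fluid.ParamAttr(
            regularizer=fluid.regularizer.L1Decay(regularization_coeff=1e-4)))
    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))

    # Global regularization: forwarded by the optimizer into
    # append_regularization_ops for parameters without their own regularizer.
    sgd = fluid.optimizer.SGD(
        learning_rate=0.01,
        regularization=fluid.regularizer.L2Decay(regularization_coeff=1e-4))
    sgd.minimize(loss)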