Unverified · commit ab5a1fb8 authored by lilong12, committed by GitHub

add device attr for regularizer, test=develop (#24981)

Parent: 0b6145e0
@@ -715,8 +715,8 @@ class Optimizer(object):
             params_grads = append_gradient_clip_ops(params_grads)
         # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
+        params_grads = append_regularization_ops(
+            params_grads, self.regularization, self._param_device_map)
         optimize_ops = self._create_optimization_pass(params_grads)
         if table_optimize_op is not None:
@@ -1070,7 +1070,7 @@ class MomentumOptimizer(Optimizer):
 class DGCMomentumOptimizer(Optimizer):
     """
     :api_attr: Static Graph

     DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887
@@ -2996,7 +2996,7 @@ Lamb = LambOptimizer
 class ModelAverage(Optimizer):
     """
     :api_attr: Static Graph

     The ModelAverage optimizer accumulates specific continuous historical parameters
     during training. The accumulated historical range can be controlled by the passed
@@ -3305,7 +3305,7 @@ class ModelAverage(Optimizer):
 class ExponentialMovingAverage(object):
     """
     :api_attr: Static Graph

     Compute the moving average of parameters with exponential decay.
     Given a parameter :math:`\\theta`, its exponential moving average (EMA)
@@ -3555,7 +3555,7 @@ class ExponentialMovingAverage(object):
 class PipelineOptimizer(object):
     """
     :api_attr: Static Graph

     Pipeline Optimizer
@@ -3857,7 +3857,7 @@ class PipelineOptimizer(object):
 class RecomputeOptimizer(Optimizer):
     """
     :api_attr: Static Graph

     Recompute Optimizer Wrapper
@@ -3931,7 +3931,7 @@ class RecomputeOptimizer(Optimizer):
     def load(self, stat_dict):
         """
         :api_attr: Static Graph

         load function is not supported by Recompute Optimizer for now.
         :return: None
@@ -4149,7 +4149,7 @@ class RecomputeOptimizer(Optimizer):
 class LookaheadOptimizer(object):
     """
     :api_attr: Static Graph

     This implements the Lookahead optimizer of the
     paper : https://arxiv.org/abs/1907.08610.
......
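The only functional change on the optimizer side is the call in the first hunk: `append_regularization_ops` now also receives `self._param_device_map`, so regularization ops can be created on the device that owns each parameter (relevant when parameters are spread across devices, e.g. under the pipeline optimizer). The remaining hunks in this file appear to be whitespace-only touches to the `:api_attr: Static Graph` docstring lines. Below is a hedged end-to-end sketch of the new signature, assuming a fluid build that contains this commit; the toy network, the direct call to the internal helper, and the uniform `cpu` placement are illustrative, not taken from the commit.

```python
import paddle.fluid as fluid
from paddle.fluid.regularizer import L2Decay, append_regularization_ops

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name="x", shape=[None, 8], dtype="float32")
    loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=1))
    params_grads = fluid.backward.append_backward(loss)

    # Illustrative placement map, keyed by parameter name exactly as the
    # regularizer consumes it (param_device_map[param.name]); a pipeline
    # setup would map different parameters to different devices.
    param_device_map = {p.name: "cpu" for p, _ in params_grads}

    # New third argument added by this commit.
    params_grads = append_regularization_ops(
        params_grads, L2Decay(1e-4), param_device_map)
```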
@@ -16,7 +16,7 @@ from __future__ import print_function
 import logging

 from . import framework
-from .framework import in_dygraph_mode, _varbase_creator
+from .framework import in_dygraph_mode, _varbase_creator, device_guard
 from . import core

 __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']
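The import hunk above only pulls in `device_guard` from `framework`. For context, a minimal sketch of what that helper does in a static-graph program, assuming a fluid build of roughly this vintage (the tiny network is illustrative): ops created inside the context get an explicit device attribute instead of the executor's default placement.

```python
import paddle.fluid as fluid
from paddle.fluid.framework import device_guard  # same helper the diff imports

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name="x", shape=[None, 8], dtype="float32")
    # The fc ops created here are pinned to the CPU, regardless of the
    # place the executor later runs the rest of the program on.
    with device_guard("cpu"):
        y = fluid.layers.fc(input=x, size=4)
```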
@@ -62,7 +62,9 @@ def _create_regularization_of_grad(param, grad, regularization=None):
     return new_grad


-def append_regularization_ops(parameters_and_grads, regularization=None):
+def append_regularization_ops(parameters_and_grads,
+                              regularization=None,
+                              param_device_map=None):
     """Create and add backward regularization Operators

     Creates and adds backward regularization operators in the BlockDesc.
@@ -93,16 +95,19 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
     repeate_regularizer = False
     with framework.name_scope('regularization'):
         for param, grad in parameters_and_grads:
+            device = param_device_map[
+                param.name] if param_device_map else None
             if not repeate_regularizer and param.regularizer is not None and regularization is not None:
                 repeate_regularizer = True
                 logging.info(
                     "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
                     "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
                     % regularization.__str__())
-            with param.block.program._optimized_guard([param, grad]):
-                new_grad = _create_regularization_of_grad(param, grad,
-                                                          regularization)
-                params_and_grads.append((param, new_grad))
+            with device_guard(device):
+                with param.block.program._optimized_guard([param, grad]):
+                    new_grad = _create_regularization_of_grad(
+                        param, grad, regularization)
+                    params_and_grads.append((param, new_grad))
     return params_and_grads
......
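Putting the regularizer change together: for every `(param, grad)` pair the device is looked up by parameter name, falling back to `None` (default placement) when no map is passed, and the decay ops are then built inside `device_guard` wrapped around the existing `_optimized_guard`. The following framework-free sketch of that control flow runs as plain Python; `device_guard` here is only a stand-in for the fluid helper and the parameter names are made up.

```python
from contextlib import contextmanager


@contextmanager
def device_guard(device=None):
    # Stand-in for fluid's device_guard: the real helper pins ops created
    # inside the context to `device`; here we only report the placement.
    print("regularization ops placed on:", device if device else "<default>")
    yield


def append_regularization_ops_sketch(parameters_and_grads,
                                     regularization=None,
                                     param_device_map=None):
    params_and_grads = []
    for param, grad in parameters_and_grads:
        # Same lookup as the diff (the real code keys the map by param.name).
        device = param_device_map[param] if param_device_map else None
        with device_guard(device):
            # Stand-in for _create_regularization_of_grad.
            new_grad = grad if regularization is None else grad + " + decay"
            params_and_grads.append((param, new_grad))
    return params_and_grads


print(append_regularization_ops_sketch(
    [("fc_0.w_0", "dW0"), ("fc_1.w_0", "dW1")],
    regularization="L2",
    param_device_map={"fc_0.w_0": "gpu:0", "fc_1.w_0": "gpu:1"}))
```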