Unverified · Commit 4137c46e authored by sneaxiy, committed by GitHub

fix multi_tensor adam/momentum bug (#47352)

Parent 40f15952
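
This change makes the multi-tensor (fused) path of `paddle.optimizer.Adam` and `paddle.optimizer.Momentum` work correctly with multiple parameter groups: every per-dtype state dict now stores one list per parameter group, and the fused `merged_adam` / `merged_momentum` kernels are invoked once per group, with `param_group_idx` selecting the matching slice. Previously each dict held a single flat list shared by all groups. A minimal sketch of the bucketing idea, using plain Python stand-ins (`Param`, `bucket_params`) rather than the actual Paddle classes:

```python
from collections import namedtuple

# Illustrative stand-in for a Paddle parameter; not a Paddle type.
Param = namedtuple("Param", ["name", "dtype"])

def make_multi_tensor_dict(num_groups):
    # One empty bucket per parameter group for each supported dtype,
    # mirroring the _create_multi_tensor_dict() helper added in optimizer.py below.
    return {
        'FP32_LODTensor': [[] for _ in range(num_groups)],
        'FP16_LODTensor': [[] for _ in range(num_groups)],
    }

def bucket_params(param_groups):
    buckets = make_multi_tensor_dict(len(param_groups))
    for group_idx, group in enumerate(param_groups):
        for p in group['params']:
            key = 'FP16_LODTensor' if p.dtype == 'float16' else 'FP32_LODTensor'
            buckets[key][group_idx].append(p)  # keyed by dtype AND by group index
    return buckets

groups = [
    {'params': [Param('w0', 'float32'), Param('w1', 'float16')]},
    {'params': [Param('w2', 'float32')]},
]
# Each group keeps its own FP32 bucket: [['w0'], ['w2']]
print([[p.name for p in bucket] for bucket in bucket_params(groups)['FP32_LODTensor']])
```

With a single flat list, the state gathered for the first parameter group was reused for subsequent groups, so later groups' fused updates could operate on the wrong tensors; the per-group indexing introduced below removes that sharing.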
@@ -1104,14 +1104,22 @@ class TestMultiTensorAdam(unittest.TestCase):
                 multi_precision=use_amp,
             )
         else:
+            parameters = list(model.parameters())
+            param_num = len(parameters)
             optimizer = paddle.optimizer.Adam(
                 parameters=[
                     {
-                        'params': model.parameters(),
+                        'params': parameters[: int(param_num / 2)],
                         'weight_decay': 0.001,
                         'beta1': 0.1,
                         'beta2': 0.99,
-                    }
+                    },
+                    {
+                        'params': parameters[int(param_num / 2) :],
+                        'weight_decay': 0.001,
+                        'beta1': 0.1,
+                        'beta2': 0.99,
+                    },
                 ],
                 use_multi_tensor=use_multi_tensor,
                 multi_precision=use_amp,
......
@@ -889,14 +889,22 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
                 multi_precision=use_amp,
             )
         else:
+            parameters = list(model.parameters())
+            n = len(parameters)
             optimizer = paddle.optimizer.Momentum(
                 parameters=[
                     {
-                        'params': model.parameters(),
+                        'params': parameters[: int(n / 2)],
                         'weight_decay': 0.001,
                         'learning_rate': 0.1,
                         'momentum': 0.99,
-                    }
+                    },
+                    {
+                        'params': parameters[int(n / 2) :],
+                        'weight_decay': 0.001,
+                        'learning_rate': 0.1,
+                        'momentum': 0.99,
+                    },
                 ],
                 use_multi_tensor=use_multi_tensor,
                 multi_precision=use_amp,
......
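
The updated tests split the model's parameters into two groups so that both groups go through the fused path. For orientation, a minimal usage sketch of that code path; the two-layer `paddle.nn` model and the 50/50 split are illustrative assumptions, not the tests' actual fixture:

```python
import paddle

# Illustrative model; the unit tests construct their own.
model = paddle.nn.Sequential(paddle.nn.Linear(8, 8), paddle.nn.Linear(8, 2))
parameters = list(model.parameters())
half = len(parameters) // 2

optimizer = paddle.optimizer.Adam(
    parameters=[
        {'params': parameters[:half], 'weight_decay': 0.001, 'beta1': 0.1, 'beta2': 0.99},
        {'params': parameters[half:], 'weight_decay': 0.001, 'beta1': 0.1, 'beta2': 0.99},
    ],
    use_multi_tensor=True,  # fused merged_adam path fixed by this commit
)

loss = model(paddle.randn([4, 8])).sum()
loss.backward()
optimizer.step()       # the groups are updated via separate fused kernel calls
optimizer.clear_grad()
```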
@@ -217,21 +217,13 @@ class Adam(Optimizer):
         self._use_multi_tensor = use_multi_tensor
         if self._use_multi_tensor:
-            self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
-            self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
-            self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
-            self._beta1_pow_acc_dict = {
-                'FP32_LODTensor': [],
-                'FP16_LODTensor': [],
-            }
-            self._beta2_pow_acc_dict = {
-                'FP32_LODTensor': [],
-                'FP16_LODTensor': [],
-            }
-            self._master_weight_dict = {
-                'FP32_LODTensor': None,
-                'FP16_LODTensor': [],
-            }
+            self._param_dict = self._create_multi_tensor_dict()
+            self._moment1_dict = self._create_multi_tensor_dict()
+            self._moment2_dict = self._create_multi_tensor_dict()
+            self._beta1_pow_acc_dict = self._create_multi_tensor_dict()
+            self._beta2_pow_acc_dict = self._create_multi_tensor_dict()
+            self._master_weight_dict = self._create_multi_tensor_dict()
+            self._master_weight_dict['FP32_LODTensor'] = None

     def _create_master_weight(self, param):
         if param.name in self._master_weights:
@@ -550,11 +542,14 @@ class Adam(Optimizer):
                 params_grads.append((param, grad_var))

             optimize_ops = self._apply_optimize(
-                loss=None, startup_program=None, params_grads=params_grads
+                loss=None,
+                startup_program=None,
+                params_grads=params_grads,
+                param_group_idx=0,
             )
         else:
             # optimize parameters in groups
-            for param_group in self._param_groups:
+            for idx, param_group in enumerate(self._param_groups):
                 params_grads = defaultdict(lambda: list())
                 for param in param_group['params']:
                     if param.stop_gradient:
@@ -566,10 +561,13 @@ class Adam(Optimizer):
                     {k: v for k, v in param_group.items() if k != 'params'}
                 )
                 self._apply_optimize(
-                    loss=None, startup_program=None, params_grads=params_grads
+                    loss=None,
+                    startup_program=None,
+                    params_grads=params_grads,
+                    param_group_idx=idx,
                 )

-    def _multi_tensor_init(self, target_block, parameters):
+    def _multi_tensor_init(self, target_block, parameters, param_group_idx):
         """
         All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
         This function will be overridden in the corresponding optimizer file.
@@ -589,21 +587,41 @@ class Adam(Optimizer):
             )
             if param.dtype == paddle.float32:
-                self._param_dict['FP32_LODTensor'].append(param)
-                self._moment1_dict['FP32_LODTensor'].append(moment1)
-                self._moment2_dict['FP32_LODTensor'].append(moment2)
-                self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc)
-                self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc)
+                self._param_dict['FP32_LODTensor'][param_group_idx].append(
+                    param
+                )
+                self._moment1_dict['FP32_LODTensor'][param_group_idx].append(
+                    moment1
+                )
+                self._moment2_dict['FP32_LODTensor'][param_group_idx].append(
+                    moment2
+                )
+                self._beta1_pow_acc_dict['FP32_LODTensor'][
+                    param_group_idx
+                ].append(beta1_pow_acc)
+                self._beta2_pow_acc_dict['FP32_LODTensor'][
+                    param_group_idx
+                ].append(beta2_pow_acc)
             elif param.dtype == paddle.float16:
-                self._param_dict['FP16_LODTensor'].append(param)
-                self._moment1_dict['FP16_LODTensor'].append(moment1)
-                self._moment2_dict['FP16_LODTensor'].append(moment2)
-                self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc)
-                self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc)
-                if self._multi_precision:
-                    self._master_weight_dict['FP16_LODTensor'].append(
-                        self._master_weights[param.name]
-                    )
+                self._param_dict['FP16_LODTensor'][param_group_idx].append(
+                    param
+                )
+                self._moment1_dict['FP16_LODTensor'][param_group_idx].append(
+                    moment1
+                )
+                self._moment2_dict['FP16_LODTensor'][param_group_idx].append(
+                    moment2
+                )
+                self._beta1_pow_acc_dict['FP16_LODTensor'][
+                    param_group_idx
+                ].append(beta1_pow_acc)
+                self._beta2_pow_acc_dict['FP16_LODTensor'][
+                    param_group_idx
+                ].append(beta2_pow_acc)
+                if self._multi_precision:
+                    self._master_weight_dict['FP16_LODTensor'][
+                        param_group_idx
+                    ].append(self._master_weights[param.name])
                 else:
                     self._master_weight_dict['FP16_LODTensor'] = None
             else:
@@ -612,7 +630,10 @@ class Adam(Optimizer):
             )

     def _append_optimize_multi_tensor_op(
-        self, target_block, parameters_and_grads
+        self,
+        target_block,
+        parameters_and_grads,
+        param_group_idx,
     ):
         """
         For Multi Tensor, append optimize merged_operator to block.
@@ -677,7 +698,7 @@ class Adam(Optimizer):
         multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
         for key in multi_tensor_list:
-            if len(self._param_dict[key]) > 0:
+            if len(self._param_dict[key][param_group_idx]) > 0:
                 find_master = self._multi_precision and key == 'FP16_LODTensor'

                 _beta1 = (
@@ -692,16 +713,23 @@ class Adam(Optimizer):
                 )

                 if framework._non_static_mode():
+                    master_weight = self._master_weight_dict[key]
+                    master_weight = (
+                        master_weight[param_group_idx]
+                        if master_weight is not None
+                        else None
+                    )
+
                     if in_dygraph_mode():
                         _, _, _, _, _, _ = _C_ops.merged_adam_(
-                            self._param_dict[key],
+                            self._param_dict[key][param_group_idx],
                             grad_dict[key],
                             lr_dict[key],
-                            self._moment1_dict[key],
-                            self._moment2_dict[key],
-                            self._beta1_pow_acc_dict[key],
-                            self._beta2_pow_acc_dict[key],
-                            self._master_weight_dict[key],
+                            self._moment1_dict[key][param_group_idx],
+                            self._moment2_dict[key][param_group_idx],
+                            self._beta1_pow_acc_dict[key][param_group_idx],
+                            self._beta2_pow_acc_dict[key][param_group_idx],
+                            master_weight,
                             _beta1,
                             _beta2,
                             self._epsilon,
@@ -710,20 +738,20 @@ class Adam(Optimizer):
                         )
                     else:
                         _, _, _, _, _, _ = _legacy_C_ops.merged_adam(
-                            self._param_dict[key],
+                            self._param_dict[key][param_group_idx],
                             grad_dict[key],
                             lr_dict[key],
-                            self._moment1_dict[key],
-                            self._moment2_dict[key],
-                            self._beta1_pow_acc_dict[key],
-                            self._beta2_pow_acc_dict[key],
-                            self._master_weight_dict[key],
-                            self._param_dict[key],
-                            self._moment1_dict[key],
-                            self._moment2_dict[key],
-                            self._beta1_pow_acc_dict[key],
-                            self._beta2_pow_acc_dict[key],
-                            self._master_weight_dict[key],
+                            self._moment1_dict[key][param_group_idx],
+                            self._moment2_dict[key][param_group_idx],
+                            self._beta1_pow_acc_dict[key][param_group_idx],
+                            self._beta2_pow_acc_dict[key][param_group_idx],
+                            master_weight,
+                            self._param_dict[key][param_group_idx],
+                            self._moment1_dict[key][param_group_idx],
+                            self._moment2_dict[key][param_group_idx],
+                            self._beta1_pow_acc_dict[key][param_group_idx],
+                            self._beta2_pow_acc_dict[key][param_group_idx],
+                            master_weight,
                             'epsilon',
                             self._epsilon,
                             'beta1',
@@ -735,20 +763,28 @@ class Adam(Optimizer):
                         )
                 else:
                     inputs = {
-                        "Param": self._param_dict[key],
+                        "Param": self._param_dict[key][param_group_idx],
                         "Grad": grad_dict[key],
                         "LearningRate": lr_dict[key],
-                        "Moment1": self._moment1_dict[key],
-                        "Moment2": self._moment2_dict[key],
-                        "Beta1Pow": self._beta1_pow_acc_dict[key],
-                        "Beta2Pow": self._beta2_pow_acc_dict[key],
+                        "Moment1": self._moment1_dict[key][param_group_idx],
+                        "Moment2": self._moment2_dict[key][param_group_idx],
+                        "Beta1Pow": self._beta1_pow_acc_dict[key][
+                            param_group_idx
+                        ],
+                        "Beta2Pow": self._beta2_pow_acc_dict[key][
+                            param_group_idx
+                        ],
                     }
                     outputs = {
-                        "ParamOut": self._param_dict[key],
-                        "Moment1Out": self._moment1_dict[key],
-                        "Moment2Out": self._moment2_dict[key],
-                        "Beta1PowOut": self._beta1_pow_acc_dict[key],
-                        "Beta2PowOut": self._beta2_pow_acc_dict[key],
+                        "ParamOut": self._param_dict[key][param_group_idx],
+                        "Moment1Out": self._moment1_dict[key][param_group_idx],
+                        "Moment2Out": self._moment2_dict[key][param_group_idx],
+                        "Beta1PowOut": self._beta1_pow_acc_dict[key][
+                            param_group_idx
+                        ],
+                        "Beta2PowOut": self._beta2_pow_acc_dict[key][
+                            param_group_idx
+                        ],
                     }
                     attrs = {
                         "epsilon": self._epsilon,
@@ -756,10 +792,12 @@ class Adam(Optimizer):
                         "beta2": _beta2,
                     }
                     if find_master:
-                        inputs["MasterParam"] = self._master_weight_dict[key]
+                        inputs["MasterParam"] = self._master_weight_dict[key][
+                            param_group_idx
+                        ]
                         outputs["MasterParamOut"] = self._master_weight_dict[
                             key
-                        ]
+                        ][param_group_idx]
                         attrs["multi_precision"] = find_master
                     target_block.append_op(
                         type="merged_adam",
......
@@ -184,20 +184,12 @@ class Momentum(Optimizer):
         }
         self._use_multi_tensor = use_multi_tensor
         if self._use_multi_tensor:
-            self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
-            self._velocity_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
-            self._master_weight_dict = {
-                'FP32_LODTensor': None,
-                'FP16_LODTensor': [],
-            }
-            self._regularization_method_dict = {
-                'FP32_LODTensor': [],
-                'FP16_LODTensor': [],
-            }
-            self._regularization_coeff_dict = {
-                'FP32_LODTensor': [],
-                'FP16_LODTensor': [],
-            }
+            self._param_dict = self._create_multi_tensor_dict()
+            self._velocity_dict = self._create_multi_tensor_dict()
+            self._master_weight_dict = self._create_multi_tensor_dict()
+            self._master_weight_dict['FP32_LODTensor'] = None
+            self._regularization_method_dict = self._create_multi_tensor_dict()
+            self._regularization_coeff_dict = self._create_multi_tensor_dict()

     def _update_regularization(self, weight_decay):
         reg_method = ""
@@ -420,7 +412,7 @@ class Momentum(Optimizer):
         return momentum_op

-    def _multi_tensor_init(self, target_block, parameters):
+    def _multi_tensor_init(self, target_block, parameters, param_group_idx):
         """
         All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
         This function will be overridden in the corresponding optimizer file.
@@ -445,37 +437,50 @@ class Momentum(Optimizer):
                     regularization_method = ""
                     regularization_coeff = 0.0
             if param.dtype == paddle.float32:
-                self._param_dict['FP32_LODTensor'].append(param)
-                self._velocity_dict['FP32_LODTensor'].append(velocity_acc)
-                # fp32 no master weight
-                self._regularization_method_dict['FP32_LODTensor'].append(
-                    regularization_method
-                )
-                self._regularization_coeff_dict['FP32_LODTensor'].append(
-                    regularization_coeff
-                )
+                self._param_dict['FP32_LODTensor'][param_group_idx].append(
+                    param
+                )
+                self._velocity_dict['FP32_LODTensor'][param_group_idx].append(
+                    velocity_acc
+                )
+                # fp32 no master weight
+                self._regularization_method_dict['FP32_LODTensor'][
+                    param_group_idx
+                ].append(regularization_method)
+                self._regularization_coeff_dict['FP32_LODTensor'][
+                    param_group_idx
+                ].append(regularization_coeff)
             elif param.dtype == paddle.float16:
-                self._param_dict['FP16_LODTensor'].append(param)
-                self._velocity_dict['FP16_LODTensor'].append(velocity_acc)
-                if self._multi_precision:
-                    self._master_weight_dict['FP16_LODTensor'].append(
-                        self._master_weights[param.name]
-                    )
-                else:
-                    self._master_weight_dict['FP16_LODTensor'] = None
-                self._regularization_method_dict['FP16_LODTensor'].append(
-                    regularization_method
-                )
-                self._regularization_coeff_dict['FP16_LODTensor'].append(
-                    regularization_coeff
-                )
+                self._param_dict['FP16_LODTensor'][param_group_idx].append(
+                    param
+                )
+                self._velocity_dict['FP16_LODTensor'][param_group_idx].append(
+                    velocity_acc
+                )
+                if self._multi_precision:
+                    self._master_weight_dict['FP16_LODTensor'][
+                        param_group_idx
+                    ].append(self._master_weights[param.name])
+                else:
+                    self._master_weight_dict['FP16_LODTensor'][
+                        param_group_idx
+                    ] = None
+                self._regularization_method_dict['FP16_LODTensor'][
+                    param_group_idx
+                ].append(regularization_method)
+                self._regularization_coeff_dict['FP16_LODTensor'][
+                    param_group_idx
+                ].append(regularization_coeff)
             else:
                 raise ValueError(
                     "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
                 )

     def _append_optimize_multi_tensor_op(
-        self, target_block, parameters_and_grads
+        self,
+        target_block,
+        parameters_and_grads,
+        param_group_idx,
     ):
         """
         For Multi Tensor, append optimize merged_operator to block.
@@ -540,71 +545,92 @@ class Momentum(Optimizer):
         multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
         for key in multi_tensor_list:
-            if len(self._param_dict[key]) > 0:
+            if len(self._param_dict[key][param_group_idx]) > 0:
                 find_master = self._multi_precision and key == 'FP16_LODTensor'
+                master_weight = self._master_weight_dict[key]
+                master_weight = (
+                    master_weight[param_group_idx]
+                    if master_weight is not None
+                    else None
+                )

                 if framework._non_static_mode():
                     if in_dygraph_mode():
                         _, _, _ = _C_ops.merged_momentum_(
-                            self._param_dict[key],
+                            self._param_dict[key][param_group_idx],
                             grad_dict[key],
-                            self._velocity_dict[key],
+                            self._velocity_dict[key][param_group_idx],
                             lr_dict[key],
-                            self._master_weight_dict[key],
+                            master_weight,
                             self._momentum,
                             self._use_nesterov,
-                            self._regularization_method_dict[key],
-                            self._regularization_coeff_dict[key],
+                            self._regularization_method_dict[key][
+                                param_group_idx
+                            ],
+                            self._regularization_coeff_dict[key][
+                                param_group_idx
+                            ],
                             find_master,
                             self._rescale_grad,
                         )
                     else:
                         _, _, _ = _legacy_C_ops.merged_momentum(
-                            self._param_dict[key],
+                            self._param_dict[key][param_group_idx],
                             grad_dict[key],
-                            self._velocity_dict[key],
+                            self._velocity_dict[key][param_group_idx],
                             lr_dict[key],
-                            self._master_weight_dict[key],
-                            self._param_dict[key],
-                            self._velocity_dict[key],
-                            self._master_weight_dict[key],
+                            master_weight,
+                            self._param_dict[key][param_group_idx],
+                            self._velocity_dict[key][param_group_idx],
+                            master_weight,
                             'mu',
                             self._momentum,
                             'use_nesterov',
                             self._use_nesterov,
                             'regularization_method',
-                            self._regularization_method_dict[key],
+                            self._regularization_method_dict[key][
+                                param_group_idx
+                            ],
                             'regularization_coeff',
-                            self._regularization_coeff_dict[key],
+                            self._regularization_coeff_dict[key][
+                                param_group_idx
+                            ],
                             'multi_precision',
                             find_master,
                         )
                 else:
                     inputs = {
-                        "Param": self._param_dict[key],
+                        "Param": self._param_dict[key][param_group_idx],
                         "Grad": grad_dict[key],
-                        "Velocity": self._velocity_dict[key],
+                        "Velocity": self._velocity_dict[key][param_group_idx],
                         "LearningRate": lr_dict[key],
                     }
                     outputs = {
-                        "ParamOut": self._param_dict[key],
-                        "VelocityOut": self._velocity_dict[key],
+                        "ParamOut": self._param_dict[key][param_group_idx],
+                        "VelocityOut": self._velocity_dict[key][
+                            param_group_idx
+                        ],
                     }
                     attrs = {
                         "mu": self._momentum,
                         "use_nesterov": self._use_nesterov,
                         "regularization_method": self._regularization_method_dict[
                             key
+                        ][
+                            param_group_idx
                         ],
                         "regularization_coeff": self._regularization_coeff_dict[
                             key
-                        ],
+                        ][param_group_idx],
                     }
                     if find_master:
-                        inputs["MasterParam"] = self._master_weight_dict[key]
+                        inputs["MasterParam"] = self._master_weight_dict[key][
+                            param_group_idx
+                        ]
                         outputs["MasterParamOut"] = self._master_weight_dict[
                             key
-                        ]
+                        ][param_group_idx]
                         attrs["multi_precision"] = find_master
                     target_block.append_op(
                         type="merged_momentum",
......
@@ -282,13 +282,20 @@ class Optimizer(object):
         # NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode.
         # Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam].
         self._use_multi_tensor = None
-        self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
+        self._param_dict = self._create_multi_tensor_dict()

         self._auxiliary_vars = {}

     def _set_auxiliary_var(self, key, val):
         self._auxiliary_vars[key] = val

+    def _create_multi_tensor_dict(self):
+        n = len(self._param_groups) if self._param_groups is not None else 1
+        return {
+            'FP32_LODTensor': [[] for _ in range(n)],
+            'FP16_LODTensor': [[] for _ in range(n)],
+        }
+
     def _get_auxiliary_var(self, key):
         return self._auxiliary_vars.get(key, None)
@@ -779,7 +786,9 @@ class Optimizer(object):
         device = self._param_device_map[param_name]
         return device

-    def _create_optimization_pass(self, parameters_and_grads):
+    def _create_optimization_pass(
+        self, parameters_and_grads, param_group_idx=0
+    ):
         """Add optimization operators to update gradients to tensors.

         Args:
@@ -825,10 +834,12 @@ class Optimizer(object):
             'Adam',
         ]:
             if (
-                len(self._param_dict['FP32_LODTensor']) == 0
-                and len(self._param_dict['FP16_LODTensor']) == 0
+                len(self._param_dict['FP32_LODTensor'][param_group_idx]) == 0
+                and len(self._param_dict['FP16_LODTensor'][param_group_idx])
+                == 0
             ):
                 if isinstance(parameters_and_grads, list):
+                    assert param_group_idx == 0
                     self._multi_tensor_init(
                         target_block,
                         [
@@ -836,6 +847,7 @@ class Optimizer(object):
                             for p in parameters_and_grads
                             if not p[0].stop_gradient
                         ],
+                        param_group_idx,
                     )
                 else:
                     self._update_param_group(parameters_and_grads)
@@ -846,10 +858,13 @@ class Optimizer(object):
                             for p in parameters_and_grads['params']
                             if not p[0].stop_gradient
                         ],
+                        param_group_idx,
                     )
             if framework._non_static_mode():
                 self._append_optimize_multi_tensor_op(
-                    target_block, parameters_and_grads
+                    target_block,
+                    parameters_and_grads,
+                    param_group_idx=param_group_idx,
                 )
             else:
                 self._update_param_device_map(
@@ -871,7 +886,9 @@ class Optimizer(object):
                 device = self._get_device_for_param(param_grad_list[0].name)
                 with device_guard(device):
                     self._append_optimize_multi_tensor_op(
-                        target_block, parameters_and_grads
+                        target_block,
+                        parameters_and_grads,
+                        param_group_idx=param_group_idx,
                     )
         else:
             if not framework._non_static_mode():
@@ -1095,7 +1112,9 @@ class Optimizer(object):
         optimize_ops = self._create_optimization_pass(params_grads)
         return optimize_ops

-    def _apply_optimize(self, loss, startup_program, params_grads):
+    def _apply_optimize(
+        self, loss, startup_program, params_grads, param_group_idx=0
+    ):
         """
         Second part of `minimize`, appending optimization operators for
         given `params_grads` pairs.
@@ -1128,8 +1147,11 @@ class Optimizer(object):
                 params_grads['params'] = self.append_regularization_ops(
                     params_grads['params'], self.regularization
                 )
-            optimize_ops = self._create_optimization_pass(params_grads)
+            optimize_ops = self._create_optimization_pass(
+                params_grads, param_group_idx=param_group_idx
+            )
         else:
+            assert param_group_idx == 0
             program = loss.block.program
             with program_guard(program, startup_program):
                 optimize_ops = self.apply_gradients(params_grads)
@@ -1398,12 +1420,15 @@ class Optimizer(object):
                 params_grads.append((param, grad_var))

             self._apply_optimize(
-                loss=None, startup_program=None, params_grads=params_grads
+                loss=None,
+                startup_program=None,
+                params_grads=params_grads,
+                param_group_idx=0,
             )
         else:
             # optimize parameters in groups
-            for param_group in self._param_groups:
+            for idx, param_group in enumerate(self._param_groups):
                 params_grads = defaultdict(lambda: list())
                 for param in param_group['params']:
                     if param.stop_gradient:
@@ -1415,7 +1440,10 @@ class Optimizer(object):
                     {k: v for k, v in param_group.items() if k != 'params'}
                 )
                 self._apply_optimize(
-                    loss=None, startup_program=None, params_grads=params_grads
+                    loss=None,
+                    startup_program=None,
+                    params_grads=params_grads,
+                    param_group_idx=idx,
                 )

     def _add_param_group(self, param_group):
@@ -1475,7 +1503,7 @@ class Optimizer(object):
         pass

     @framework.dygraph_only
-    def _multi_tensor_init(self, target_block, parameters):
+    def _multi_tensor_init(self, target_block, parameters, param_group_idx):
         """
         All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
         This function will be overridden in the corresponding optimizer file.
@@ -1488,7 +1516,7 @@ class Optimizer(object):
     @framework.dygraph_only
     def _append_optimize_multi_tensor_op(
-        self, target_block, parameters_and_grads
+        self, target_block, parameters_and_grads, param_group_idx
     ):
         """
         For Multi Tensor, append optimize merged_operator to block.
......