未验证 提交 6c9fa665 编写于 作者: W wanghuancoder 提交者: GitHub

delete legacy dygraph code in python/paddle/optimizer (#49308)

上级 983ae1d7
......@@ -170,29 +170,29 @@ class Adadelta(Optimizer):
self._epsilon,
)
return None
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
# Create the adadelta optimizer op
adadelta_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"AvgSquaredGrad": avg_squared_grad_acc,
"AvgSquaredUpdate": avg_squared_update_acc,
},
outputs={
"ParamOut": param_and_grad[0],
"AvgSquaredGradOut": avg_squared_grad_acc,
"AvgSquaredUpdateOut": avg_squared_update_acc,
},
attrs={"epsilon": self._epsilon, "rho": self._rho},
stop_gradient=True,
)
return adadelta_op
else:
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
# Create the adadelta optimizer op
adadelta_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"AvgSquaredGrad": avg_squared_grad_acc,
"AvgSquaredUpdate": avg_squared_update_acc,
},
outputs={
"ParamOut": param_and_grad[0],
"AvgSquaredGradOut": avg_squared_grad_acc,
"AvgSquaredUpdateOut": avg_squared_update_acc,
},
attrs={"epsilon": self._epsilon, "rho": self._rho},
stop_gradient=True,
)
return adadelta_op
def _update_param_group(self, parameters):
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
......
......@@ -16,7 +16,7 @@ import warnings
from collections import defaultdict
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from ..fluid import core, framework, unique_name
from ..fluid.dygraph import base as imperative_base
......@@ -393,98 +393,55 @@ class Adam(Optimizer):
)
return None
if framework._in_legacy_dygraph():
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _legacy_C_ops.adam(
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'epsilon',
self._epsilon,
'lazy_mode',
self._lazy_mode,
'min_row_size_to_use_multithread',
1000,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
return None
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"LearningRate": [lr],
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
"Moment2Out": [moment2],
"Beta1PowOut": [beta1_pow_acc],
"Beta2PowOut": [beta2_pow_acc],
}
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000,
"multi_precision": find_master,
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
if isinstance(self._epsilon, Variable):
inputs['EpsilonTensor'] = self._epsilon
else:
attrs['epsilon'] = self._epsilon
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adam_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"LearningRate": [lr],
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
"Moment2Out": [moment2],
"Beta1PowOut": [beta1_pow_acc],
"Beta2PowOut": [beta2_pow_acc],
}
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000,
"multi_precision": find_master,
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
if isinstance(self._epsilon, Variable):
inputs['EpsilonTensor'] = self._epsilon
else:
attrs['epsilon'] = self._epsilon
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adam_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return adam_op
return adam_op
@imperative_base.no_grad
@framework.dygraph_only
......@@ -729,55 +686,28 @@ class Adam(Optimizer):
else self._beta2.numpy().item(0)
)
if framework._non_static_mode():
if framework.in_dygraph_mode():
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
else:
_, _, _, _, _, _ = _legacy_C_ops.merged_adam(
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
self._param_dict[key][param_group_idx],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
'epsilon',
self._epsilon,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
_, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
else:
inputs = {
"Param": self._param_dict[key][param_group_idx],
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from ..fluid import framework
from ..fluid.dygraph import no_grad
......@@ -210,24 +210,6 @@ class Adamax(Optimizer):
self._beta2,
self._epsilon,
)
elif framework._in_legacy_dygraph():
_legacy_C_ops.adamax(
param_and_grad[0],
param_and_grad[1],
self._create_param_lr(param_and_grad),
moment,
inf_norm,
beta1_pow_acc,
param_and_grad[0],
moment,
inf_norm,
"beta1",
self._beta1,
"beta2",
self._beta2,
"epsilon",
self._epsilon,
)
else:
# create the adamax optimize op
adamax_op = block.append_op(
......@@ -271,20 +253,20 @@ class Adamax(Optimizer):
beta1_pow_acc, self._beta1, 0.0, True
)
beta1_pow_acc.copy_(tmp, False)
continue
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param
)
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True,
)
else:
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param
)
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True,
)
else:
for param, grad in parameters_and_grads['params']:
if grad is None or param.stop_gradient is True:
......@@ -301,24 +283,23 @@ class Adamax(Optimizer):
beta1_pow_acc, self._beta1, 0.0, True
)
beta1_pow_acc.copy_(tmp, False)
continue
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param
)
self._beta1 = parameters_and_grads.get(
'beta1', self._default_dict['beta1']
)
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True,
)
else:
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param
)
self._beta1 = parameters_and_grads.get(
'beta1', self._default_dict['beta1']
)
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True,
)
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
......
......@@ -18,7 +18,7 @@ from collections.abc import Callable
import paddle
from .. import _C_ops, _legacy_C_ops
from .. import _C_ops
from ..fluid import core, framework, unique_name
from ..fluid.clip import GradientClipBase
from ..fluid.dygraph import base as imperative_base
......@@ -473,7 +473,7 @@ class AdamW(Optimizer):
lr = self._create_param_lr(param_and_grad)
# create the adamw optimize op
if framework._non_static_mode():
if framework.in_dygraph_mode():
lr_ratio_ = (
1.0
if self._lr_ratio is None
......@@ -491,126 +491,90 @@ class AdamW(Optimizer):
else self._beta2.numpy().item(0)
)
if framework.in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
_, _, _, _, _, _ = _C_ops.adamw_(
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
found_inf,
_beta1,
_beta2,
self._epsilon,
lr_ratio_,
self._weight_decay,
with_decay,
self._lazy_mode,
1000,
find_master,
False,
)
else:
_, _, _, _, _, _ = _legacy_C_ops.adamw(
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'epsilon',
self._epsilon,
'lazy_mode',
self._lazy_mode,
'min_row_size_to_use_multithread',
1000,
'beta1',
_beta1,
'beta2',
_beta2,
"with_decay",
with_decay,
'coeff',
self._weight_decay,
'multi_precision',
find_master,
'lr_ratio',
lr_ratio_,
)
found_inf = self._get_auxiliary_var('found_inf')
_, _, _, _, _, _ = _C_ops.adamw_(
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
found_inf,
_beta1,
_beta2,
self._epsilon,
lr_ratio_,
self._weight_decay,
with_decay,
self._lazy_mode,
1000,
find_master,
False,
)
return None
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"LearningRate": [lr],
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc],
}
# Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
inputs['SkipUpdate'] = found_inf
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
"Moment2Out": [moment2],
"Beta1PowOut": [beta1_pow_acc],
"Beta2PowOut": [beta2_pow_acc],
}
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000,
"multi_precision": find_master,
"with_decay": with_decay,
"coeff": self._weight_decay,
"lr_ratio": 1.0
if self._lr_ratio is None
else self._lr_ratio(param_and_grad[0]),
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
if isinstance(self._epsilon, Variable):
inputs['EpsilonTensor'] = self._epsilon
else:
attrs['epsilon'] = self._epsilon
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adamw_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"LearningRate": [lr],
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc],
}
# Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
inputs['SkipUpdate'] = found_inf
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
"Moment2Out": [moment2],
"Beta1PowOut": [beta1_pow_acc],
"Beta2PowOut": [beta2_pow_acc],
}
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000,
"multi_precision": find_master,
"with_decay": with_decay,
"coeff": self._weight_decay,
"lr_ratio": 1.0
if self._lr_ratio is None
else self._lr_ratio(param_and_grad[0]),
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
if isinstance(self._epsilon, Variable):
inputs['EpsilonTensor'] = self._epsilon
else:
attrs['epsilon'] = self._epsilon
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adamw_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return adamw_op
return adamw_op
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
......
......@@ -13,7 +13,7 @@
# limitations under the License.
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from paddle.fluid.executor import global_scope
from ..fluid import core, framework, unique_name
......@@ -313,76 +313,48 @@ class Lamb(Optimizer):
find_master,
)
return None
if framework._non_static_mode():
_legacy_C_ops.lamb(
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'beta1',
self._beta1,
'beta2',
self._beta2,
'epsilon',
self._epsilon,
'weight_decay',
weight_decay,
'multi_precision',
find_master,
else:
# create the lamb optimize op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr,
"Moment1": moment1,
"Moment2": moment2,
"Beta1Pow": beta1_pow_acc,
"Beta2Pow": beta2_pow_acc,
}
outputs = {
"ParamOut": param_and_grad[0],
"Moment1Out": moment1,
"Moment2Out": moment2,
"Beta1PowOut": beta1_pow_acc,
"Beta2PowOut": beta2_pow_acc,
}
attrs = {
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon,
"weight_decay": weight_decay,
"multi_precision": find_master,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
if found_inf:
inputs["SkipUpdate"] = found_inf
lamb_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return None
# create the lamb optimize op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr,
"Moment1": moment1,
"Moment2": moment2,
"Beta1Pow": beta1_pow_acc,
"Beta2Pow": beta2_pow_acc,
}
outputs = {
"ParamOut": param_and_grad[0],
"Moment1Out": moment1,
"Moment2Out": moment2,
"Beta1PowOut": beta1_pow_acc,
"Beta2PowOut": beta2_pow_acc,
}
attrs = {
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon,
"weight_decay": weight_decay,
"multi_precision": find_master,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
if found_inf:
inputs["SkipUpdate"] = found_inf
lamb_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return lamb_op
return lamb_op
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
......
......@@ -20,8 +20,6 @@ import numpy
import paddle.fluid.core as core
from paddle import Tensor
from ..fluid.framework import _in_legacy_dygraph
__all__ = [ # noqa
'LRScheduler',
'NoamDecay',
......@@ -1395,15 +1393,8 @@ class ReduceOnPlateau(LRScheduler):
else:
self.last_epoch = epoch
if not _in_legacy_dygraph():
tmp = core.eager.Tensor
else:
# need to declarate explicitly
from paddle.framework import VarBase as Tensor
tmp = Tensor
# loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
if isinstance(metrics, (tmp, numpy.ndarray)):
if isinstance(metrics, (core.eager.Tensor, numpy.ndarray)):
assert len(metrics.shape) == 1 and metrics.shape[0] == 1, (
"the metrics.shape "
"should be (1L,), but the current metrics.shape is {}. Maybe that "
......
......@@ -15,8 +15,8 @@
import warnings
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.regularizer import L2DecayRegularizer
from ..fluid import core, framework, unique_name
......@@ -333,30 +333,6 @@ class Momentum(Optimizer):
else None
)
if _in_legacy_dygraph():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _, _ = _legacy_C_ops.momentum(
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
param_and_grad[0],
velocity_acc,
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
regularization_method,
'regularization_coeff',
regularization_coeff,
'multi_precision',
find_master,
)
return None
if in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
......@@ -373,42 +349,42 @@ class Momentum(Optimizer):
find_master,
self._rescale_grad,
)
else:
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": regularization_method,
"regularization_coeff": regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc],
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": regularization_method,
"regularization_coeff": regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc],
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return momentum_op
return momentum_op
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
......@@ -553,50 +529,20 @@ class Momentum(Optimizer):
else None
)
if framework._non_static_mode():
if in_dygraph_mode():
_, _, _ = _C_ops.merged_momentum_(
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov,
self._regularization_method_dict[key][
param_group_idx
],
self._regularization_coeff_dict[key][
param_group_idx
],
find_master,
self._rescale_grad,
)
else:
_, _, _ = _legacy_C_ops.merged_momentum(
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._param_dict[key][param_group_idx],
self._velocity_dict[key][param_group_idx],
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
self._regularization_method_dict[key][
param_group_idx
],
'regularization_coeff',
self._regularization_coeff_dict[key][
param_group_idx
],
'multi_precision',
find_master,
)
if in_dygraph_mode():
_, _, _ = _C_ops.merged_momentum_(
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov,
self._regularization_method_dict[key][param_group_idx],
self._regularization_coeff_dict[key][param_group_idx],
find_master,
self._rescale_grad,
)
else:
inputs = {
"Param": self._param_dict[key][param_group_idx],
......
......@@ -18,13 +18,12 @@ from collections import defaultdict
import numpy as np
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid.framework import (
Variable,
_current_expected_place,
_in_eager_without_dygraph_check,
_in_legacy_dygraph,
default_main_program,
device_guard,
in_dygraph_mode,
......@@ -534,17 +533,6 @@ class Optimizer:
current_lr.dtype,
place,
)
elif _in_legacy_dygraph():
_legacy_C_ops.fill_constant(
current_lr,
'value',
float(value),
'dtype',
current_lr.dtype,
'shape',
list(current_lr.shape),
)
else:
global_block = framework.default_main_program().global_block()
global_block.append_op(
......@@ -1042,28 +1030,16 @@ class Optimizer:
if self._dtype is None:
self._dtype = loss.dtype
if framework._non_static_mode():
if framework.in_dygraph_mode():
parameter_list = parameters if parameters else self._parameter_list
if framework.in_dygraph_mode():
# It is very time-consuming to call c++ functions in a loop on the python side.
# We put this part of the code on the c++ side to improve the speed in eager mode.
params_grads = []
grads = core.eager.get_all_grads(parameter_list)
for index, grad in enumerate(grads):
if grad is not None:
params_grads.append((parameter_list[index], grad))
else:
# Keep the original code to support legacy mode.
# Delete the else branch when the legacy mode exits.
params_grads = []
for param in parameter_list:
if param.stop_gradient:
continue
if param._grad_ivar() is not None:
# create gradient tensor
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
# It is very time-consuming to call c++ functions in a loop on the python side.
# We put this part of the code on the c++ side to improve the speed in eager mode.
params_grads = []
grads = core.eager.get_all_grads(parameter_list)
for index, grad in enumerate(grads):
if grad is not None:
params_grads.append((parameter_list[index], grad))
else:
if callbacks is None:
callbacks = [error_clip_callback]
......@@ -1207,28 +1183,26 @@ class Optimizer:
if framework.in_dygraph_mode():
return _C_ops.add_n([grad, regularization_term])
elif framework._in_legacy_dygraph():
return _legacy_C_ops.sum([grad, regularization_term])
new_grad = grad
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
# FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
# the grad's type and name will be changed. But the gradient's name
# is used in ParallelExecutor Reduce mode, so I add a flag for
# the new_grad here.
new_grad = grad.block.create_var(
name=grad.name + core.kNewGradSuffix(),
dtype=param.dtype,
shape=param.shape,
lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR,
)
else:
new_grad = grad
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
# FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
# the grad's type and name will be changed. But the gradient's name
# is used in ParallelExecutor Reduce mode, so I add a flag for
# the new_grad here.
new_grad = grad.block.create_var(
name=grad.name + core.kNewGradSuffix(),
dtype=param.dtype,
shape=param.shape,
lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR,
)
inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]}
grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]}
grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
return new_grad
return new_grad
def append_regularization_ops(
self, parameters_and_grads, regularization=None
......
......@@ -15,11 +15,11 @@
import warnings
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from ..fluid import core, framework, unique_name
from ..fluid.dygraph import no_grad
from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from ..fluid.framework import in_dygraph_mode
from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
......@@ -166,42 +166,32 @@ class SGD(Optimizer):
find_master,
)
return None
if _in_legacy_dygraph():
_legacy_C_ops.sgd(
param_and_grad[0],
lr,
param_and_grad[1],
master_weight,
param_and_grad[0],
master_weight,
else:
assert isinstance(block, framework.Block)
# create the optimize op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr,
}
outputs = {"ParamOut": param_and_grad[0]}
attrs = {"multi_precision": find_master}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
sgd_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return None
assert isinstance(block, framework.Block)
# create the optimize op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr,
}
outputs = {"ParamOut": param_and_grad[0]}
attrs = {"multi_precision": find_master}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
sgd_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return sgd_op
return sgd_op
def _update_param_group(self, parameters):
parameters = parameters.get('params')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册