Unverified commit 6c9fa665, authored by wanghuancoder, committed by GitHub

delete legacy dygraph code in python/paddle/optimizer (#49308)

Parent 983ae1d7
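Across every file touched here the change follows one pattern: the transitional `framework._in_legacy_dygraph()` / `framework._non_static_mode()` branches that dispatched to `_legacy_C_ops` are removed, leaving only the eager branch (`framework.in_dygraph_mode()` plus `_C_ops`) and the static-graph branch (`block.append_op`). A runnable toy sketch of that shape, using placeholder names rather than Paddle's real internals:

    # Stand-ins for framework.in_dygraph_mode() / framework._in_legacy_dygraph().
    def in_dygraph_mode() -> bool:
        return True

    def in_legacy_dygraph() -> bool:
        return False

    def append_optimize_op_before(eager_update, legacy_update, static_append):
        # Old shape: three dispatch paths.
        if in_dygraph_mode():
            return eager_update()      # _C_ops.xxx_(...)
        if in_legacy_dygraph():
            return legacy_update()     # _legacy_C_ops.xxx(...)  <- branch deleted by this PR
        return static_append()         # block.append_op(...)

    def append_optimize_op_after(eager_update, static_append):
        # New shape: eager mode vs. static graph only.
        if in_dygraph_mode():
            return eager_update()
        else:
            return static_append()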
@@ -170,29 +170,29 @@ class Adadelta(Optimizer):
                    self._epsilon,
                )
            return None
+        else:
            if not isinstance(block, framework.Block):
                raise TypeError("block is not instance of framework.Block.")
            # Create the adadelta optimizer op
            adadelta_op = block.append_op(
                type=self.type,
                inputs={
                    "Param": param_and_grad[0],
                    "Grad": param_and_grad[1],
                    "AvgSquaredGrad": avg_squared_grad_acc,
                    "AvgSquaredUpdate": avg_squared_update_acc,
                },
                outputs={
                    "ParamOut": param_and_grad[0],
                    "AvgSquaredGradOut": avg_squared_grad_acc,
                    "AvgSquaredUpdateOut": avg_squared_update_acc,
                },
                attrs={"epsilon": self._epsilon, "rho": self._rho},
                stop_gradient=True,
            )
            return adadelta_op

    def _update_param_group(self, parameters):
        self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
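For reference, the AvgSquaredGrad / AvgSquaredUpdate accumulators wired into the op above implement the standard Adadelta rule with decay rho and epsilon; a NumPy sketch of one update (illustrative, not Paddle code):

    import numpy as np

    def adadelta_step(param, grad, avg_sq_grad, avg_sq_update, rho=0.95, epsilon=1e-6):
        # Running average of squared gradients.
        avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
        # Step scaled by the ratio of past update magnitude to gradient magnitude.
        delta = -np.sqrt(avg_sq_update + epsilon) / np.sqrt(avg_sq_grad + epsilon) * grad
        # Running average of squared updates.
        avg_sq_update = rho * avg_sq_update + (1 - rho) * delta ** 2
        return param + delta, avg_sq_grad, avg_sq_update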
@@ -16,7 +16,7 @@ import warnings
from collections import defaultdict

import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops

from ..fluid import core, framework, unique_name
from ..fluid.dygraph import base as imperative_base
@@ -393,98 +393,55 @@ class Adam(Optimizer):
            )
            return None
-        if framework._in_legacy_dygraph():
-            _beta1 = (
-                self._beta1
-                if not isinstance(self._beta1, Variable)
-                else self._beta1.numpy().item(0)
-            )
-            _beta2 = (
-                self._beta2
-                if not isinstance(self._beta2, Variable)
-                else self._beta2.numpy().item(0)
-            )
-            _, _, _, _, _, _ = _legacy_C_ops.adam(
-                param_and_grad[0],
-                param_and_grad[1],
-                lr,
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                param_and_grad[0],
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                'epsilon',
-                self._epsilon,
-                'lazy_mode',
-                self._lazy_mode,
-                'min_row_size_to_use_multithread',
-                1000,
-                'beta1',
-                _beta1,
-                'beta2',
-                _beta2,
-                'multi_precision',
-                find_master,
-            )
-            return None
+        else:
            inputs = {
                "Param": [param_and_grad[0]],
                "Grad": [param_and_grad[1]],
                "LearningRate": [lr],
                "Moment1": [moment1],
                "Moment2": [moment2],
                "Beta1Pow": [beta1_pow_acc],
                "Beta2Pow": [beta2_pow_acc],
            }
            outputs = {
                "ParamOut": [param_and_grad[0]],
                "Moment1Out": [moment1],
                "Moment2Out": [moment2],
                "Beta1PowOut": [beta1_pow_acc],
                "Beta2PowOut": [beta2_pow_acc],
            }
            attrs = {
                "lazy_mode": self._lazy_mode,
                "min_row_size_to_use_multithread": 1000,
                "multi_precision": find_master,
            }
            if isinstance(self._beta1, Variable):
                inputs['Beta1Tensor'] = self._beta1
            else:
                attrs['beta1'] = self._beta1
            if isinstance(self._beta2, Variable):
                inputs['Beta2Tensor'] = self._beta2
            else:
                attrs['beta2'] = self._beta2
            if isinstance(self._epsilon, Variable):
                inputs['EpsilonTensor'] = self._epsilon
            else:
                attrs['epsilon'] = self._epsilon
            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight
            adam_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs=attrs,
                stop_gradient=True,
            )
            return adam_op

    @imperative_base.no_grad
    @framework.dygraph_only
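In the static-graph branch kept above, beta1/beta2/epsilon become either op attributes (plain Python floats) or op inputs such as Beta1Tensor (when the caller supplies a Variable, e.g. a scheduled beta). A condensed, runnable sketch of that routing; the helper name and the hasattr check are illustrative assumptions, not Paddle code:

    def route_scalar_or_tensor(name, value, inputs, attrs):
        # Tensors become op inputs (e.g. "Beta1Tensor"); plain floats become attrs ("beta1").
        if hasattr(value, "shape"):
            inputs[name.capitalize() + "Tensor"] = value
        else:
            attrs[name] = value

    inputs, attrs = {}, {}
    route_scalar_or_tensor("beta1", 0.9, inputs, attrs)
    route_scalar_or_tensor("beta2", 0.999, inputs, attrs)
    print(inputs, attrs)  # {} {'beta1': 0.9, 'beta2': 0.999}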
@@ -729,55 +686,28 @@ class Adam(Optimizer):
                    else self._beta2.numpy().item(0)
                )

-            if framework._non_static_mode():
+            if framework.in_dygraph_mode():
                master_weight = self._master_weight_dict[key]
                master_weight = (
                    master_weight[param_group_idx]
                    if master_weight is not None
                    else None
                )
-                if in_dygraph_mode():
                _, _, _, _, _, _ = _C_ops.merged_adam_(
                    self._param_dict[key][param_group_idx],
                    grad_dict[key],
                    lr_dict[key],
                    self._moment1_dict[key][param_group_idx],
                    self._moment2_dict[key][param_group_idx],
                    self._beta1_pow_acc_dict[key][param_group_idx],
                    self._beta2_pow_acc_dict[key][param_group_idx],
                    master_weight,
                    _beta1,
                    _beta2,
                    self._epsilon,
                    find_master,
                    False,
                )
-                else:
-                    _, _, _, _, _, _ = _legacy_C_ops.merged_adam(
-                        self._param_dict[key][param_group_idx],
-                        grad_dict[key],
-                        lr_dict[key],
-                        self._moment1_dict[key][param_group_idx],
-                        self._moment2_dict[key][param_group_idx],
-                        self._beta1_pow_acc_dict[key][param_group_idx],
-                        self._beta2_pow_acc_dict[key][param_group_idx],
-                        master_weight,
-                        self._param_dict[key][param_group_idx],
-                        self._moment1_dict[key][param_group_idx],
-                        self._moment2_dict[key][param_group_idx],
-                        self._beta1_pow_acc_dict[key][param_group_idx],
-                        self._beta2_pow_acc_dict[key][param_group_idx],
-                        master_weight,
-                        'epsilon',
-                        self._epsilon,
-                        'beta1',
-                        _beta1,
-                        'beta2',
-                        _beta2,
-                        'multi_precision',
-                        find_master,
-                    )
            else:
                inputs = {
                    "Param": self._param_dict[key][param_group_idx],
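The merged_adam path above updates a whole group of parameters with a single fused call instead of looping over per-parameter ops in Python. A pure-NumPy sketch of the idea (the fused kernel itself lives in C++; this is only a conceptual stand-in):

    import numpy as np

    def merged_update(params, grads, lr):
        # One call handles every parameter in the group, so Python-side
        # dispatch cost is paid once rather than once per parameter.
        for p, g in zip(params, grads):
            p -= lr * g

    params = [np.ones(3), np.ones(2)]
    grads = [np.full(3, 0.1), np.full(2, 0.2)]
    merged_update(params, grads, lr=0.5)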
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops

from ..fluid import framework
from ..fluid.dygraph import no_grad
@@ -210,24 +210,6 @@ class Adamax(Optimizer):
                self._beta2,
                self._epsilon,
            )
-        elif framework._in_legacy_dygraph():
-            _legacy_C_ops.adamax(
-                param_and_grad[0],
-                param_and_grad[1],
-                self._create_param_lr(param_and_grad),
-                moment,
-                inf_norm,
-                beta1_pow_acc,
-                param_and_grad[0],
-                moment,
-                inf_norm,
-                "beta1",
-                self._beta1,
-                "beta2",
-                self._beta2,
-                "epsilon",
-                self._epsilon,
-            )
        else:
            # create the adamax optimize op
            adamax_op = block.append_op(
@@ -271,20 +253,20 @@ class Adamax(Optimizer):
                        beta1_pow_acc, self._beta1, 0.0, True
                    )
                    beta1_pow_acc.copy_(tmp, False)
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]
-                ), name_scope('adamax'):
-                    beta1_pow_acc = self._get_accumulator(
-                        self._beta1_pow_acc_str, param
-                    )
-                    block.append_op(
-                        type="scale",
-                        inputs={"X": beta1_pow_acc},
-                        outputs={"Out": beta1_pow_acc},
-                        attrs={"scale": self._beta1},
-                        stop_gradient=True,
-                    )
+                else:
+                    with param.block.program._optimized_guard(
+                        [param, grad]
+                    ), name_scope('adamax'):
+                        beta1_pow_acc = self._get_accumulator(
+                            self._beta1_pow_acc_str, param
+                        )
+                        block.append_op(
+                            type="scale",
+                            inputs={"X": beta1_pow_acc},
+                            outputs={"Out": beta1_pow_acc},
+                            attrs={"scale": self._beta1},
+                            stop_gradient=True,
+                        )
        else:
            for param, grad in parameters_and_grads['params']:
                if grad is None or param.stop_gradient is True:
@@ -301,24 +283,23 @@ class Adamax(Optimizer):
                        beta1_pow_acc, self._beta1, 0.0, True
                    )
                    beta1_pow_acc.copy_(tmp, False)
-                    continue
-
-                with param.block.program._optimized_guard(
-                    [param, grad]
-                ), name_scope('adamax'):
-                    beta1_pow_acc = self._get_accumulator(
-                        self._beta1_pow_acc_str, param
-                    )
-                    self._beta1 = parameters_and_grads.get(
-                        'beta1', self._default_dict['beta1']
-                    )
-                    block.append_op(
-                        type="scale",
-                        inputs={"X": beta1_pow_acc},
-                        outputs={"Out": beta1_pow_acc},
-                        attrs={"scale": self._beta1},
-                        stop_gradient=True,
-                    )
+                else:
+                    with param.block.program._optimized_guard(
+                        [param, grad]
+                    ), name_scope('adamax'):
+                        beta1_pow_acc = self._get_accumulator(
+                            self._beta1_pow_acc_str, param
+                        )
+                        self._beta1 = parameters_and_grads.get(
+                            'beta1', self._default_dict['beta1']
+                        )
+                        block.append_op(
+                            type="scale",
+                            inputs={"X": beta1_pow_acc},
+                            outputs={"Out": beta1_pow_acc},
+                            attrs={"scale": self._beta1},
+                            stop_gradient=True,
+                        )

    def _update_param_group(self, parameters):
        self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
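Both hunks above make the same control-flow change: the early `continue` in the dygraph branch becomes an explicit `else:`, so the static-graph bookkeeping runs only when not in dygraph mode. The two forms are equivalent, as this small sketch shows:

    def finish_update_with_continue(items, fast_path, slow_path, use_fast):
        for x in items:
            if use_fast:
                fast_path(x)
                continue        # old style: skip the rest of the loop body
            slow_path(x)

    def finish_update_with_else(items, fast_path, slow_path, use_fast):
        for x in items:
            if use_fast:
                fast_path(x)
            else:               # new style: the alternative is explicit
                slow_path(x)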
@@ -18,7 +18,7 @@ from collections.abc import Callable
import paddle

-from .. import _C_ops, _legacy_C_ops
+from .. import _C_ops
from ..fluid import core, framework, unique_name
from ..fluid.clip import GradientClipBase
from ..fluid.dygraph import base as imperative_base
@@ -473,7 +473,7 @@ class AdamW(Optimizer):
        lr = self._create_param_lr(param_and_grad)

        # create the adamw optimize op
-        if framework._non_static_mode():
+        if framework.in_dygraph_mode():
            lr_ratio_ = (
                1.0
                if self._lr_ratio is None
@@ -491,126 +491,90 @@ class AdamW(Optimizer):
                else self._beta2.numpy().item(0)
            )
-            if framework.in_dygraph_mode():
            found_inf = self._get_auxiliary_var('found_inf')
            _, _, _, _, _, _ = _C_ops.adamw_(
                param_and_grad[0],
                param_and_grad[1],
                lr,
                moment1,
                moment2,
                beta1_pow_acc,
                beta2_pow_acc,
                master_weight,
                found_inf,
                _beta1,
                _beta2,
                self._epsilon,
                lr_ratio_,
                self._weight_decay,
                with_decay,
                self._lazy_mode,
                1000,
                find_master,
                False,
            )
-            else:
-                _, _, _, _, _, _ = _legacy_C_ops.adamw(
-                    param_and_grad[0],
-                    param_and_grad[1],
-                    lr,
-                    moment1,
-                    moment2,
-                    beta1_pow_acc,
-                    beta2_pow_acc,
-                    master_weight,
-                    param_and_grad[0],
-                    moment1,
-                    moment2,
-                    beta1_pow_acc,
-                    beta2_pow_acc,
-                    master_weight,
-                    'epsilon',
-                    self._epsilon,
-                    'lazy_mode',
-                    self._lazy_mode,
-                    'min_row_size_to_use_multithread',
-                    1000,
-                    'beta1',
-                    _beta1,
-                    'beta2',
-                    _beta2,
-                    "with_decay",
-                    with_decay,
-                    'coeff',
-                    self._weight_decay,
-                    'multi_precision',
-                    find_master,
-                    'lr_ratio',
-                    lr_ratio_,
-                )
            return None
+        else:
            inputs = {
                "Param": [param_and_grad[0]],
                "Grad": [param_and_grad[1]],
                "LearningRate": [lr],
                "Moment1": [moment1],
                "Moment2": [moment2],
                "Beta1Pow": [beta1_pow_acc],
                "Beta2Pow": [beta2_pow_acc],
            }
            # Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow
            found_inf = self._get_auxiliary_var('found_inf')

            if found_inf:
                inputs['SkipUpdate'] = found_inf

            outputs = {
                "ParamOut": [param_and_grad[0]],
                "Moment1Out": [moment1],
                "Moment2Out": [moment2],
                "Beta1PowOut": [beta1_pow_acc],
                "Beta2PowOut": [beta2_pow_acc],
            }
            attrs = {
                "lazy_mode": self._lazy_mode,
                "min_row_size_to_use_multithread": 1000,
                "multi_precision": find_master,
                "with_decay": with_decay,
                "coeff": self._weight_decay,
                "lr_ratio": 1.0
                if self._lr_ratio is None
                else self._lr_ratio(param_and_grad[0]),
            }
            if isinstance(self._beta1, Variable):
                inputs['Beta1Tensor'] = self._beta1
            else:
                attrs['beta1'] = self._beta1
            if isinstance(self._beta2, Variable):
                inputs['Beta2Tensor'] = self._beta2
            else:
                attrs['beta2'] = self._beta2
            if isinstance(self._epsilon, Variable):
                inputs['EpsilonTensor'] = self._epsilon
            else:
                attrs['epsilon'] = self._epsilon
            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight
            adamw_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs=attrs,
                stop_gradient=True,
            )
            return adamw_op

    def __str__(self):
        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
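The with_decay / coeff attributes retained in this hunk carry AdamW's decoupled weight decay, applied to the parameter itself rather than folded into the gradient. A NumPy sketch of one step (illustrative; lr_ratio and master weights omitted):

    import numpy as np

    def adamw_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999,
                   eps=1e-8, coeff=0.01, with_decay=True):
        m = beta1 * m + (1 - beta1) * g              # first moment
        v = beta2 * v + (1 - beta2) * g ** 2         # second moment
        m_hat = m / (1 - beta1 ** t)                 # bias correction
        v_hat = v / (1 - beta2 ** t)
        if with_decay:
            p = p - lr * coeff * p                   # decoupled weight decay
        p = p - lr * m_hat / (np.sqrt(v_hat) + eps)  # Adam update
        return p, m, v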
@@ -13,7 +13,7 @@
# limitations under the License.

import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
from paddle.fluid.executor import global_scope

from ..fluid import core, framework, unique_name
@@ -313,76 +313,48 @@ class Lamb(Optimizer):
                find_master,
            )
            return None
-        if framework._non_static_mode():
-            _legacy_C_ops.lamb(
-                param_and_grad[0],
-                param_and_grad[1],
-                lr,
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                param_and_grad[0],
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                'beta1',
-                self._beta1,
-                'beta2',
-                self._beta2,
-                'epsilon',
-                self._epsilon,
-                'weight_decay',
-                weight_decay,
-                'multi_precision',
-                find_master,
-            )
-            return None
+        else:
            # create the lamb optimize op
            inputs = {
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": lr,
                "Moment1": moment1,
                "Moment2": moment2,
                "Beta1Pow": beta1_pow_acc,
                "Beta2Pow": beta2_pow_acc,
            }
            outputs = {
                "ParamOut": param_and_grad[0],
                "Moment1Out": moment1,
                "Moment2Out": moment2,
                "Beta1PowOut": beta1_pow_acc,
                "Beta2PowOut": beta2_pow_acc,
            }
            attrs = {
                "beta1": self._beta1,
                "beta2": self._beta2,
                "epsilon": self._epsilon,
                "weight_decay": weight_decay,
                "multi_precision": find_master,
            }

            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight

            if found_inf:
                inputs["SkipUpdate"] = found_inf

            lamb_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs=attrs,
                stop_gradient=True,
            )
            return lamb_op

    def _update_param_group(self, parameters):
        self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
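For reference, LAMB combines Adam-style moments with a per-layer trust ratio, ||w|| / ||update||, that rescales each parameter's step; a NumPy sketch of that scaling (not the exact kernel behind the op):

    import numpy as np

    def lamb_scale(param, adam_update, weight_decay=0.01):
        update = adam_update + weight_decay * param
        w_norm = np.linalg.norm(param)
        u_norm = np.linalg.norm(update)
        ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
        return ratio * update  # the optimizer then applies param -= lr * (scaled update)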
@@ -20,8 +20,6 @@ import numpy
import paddle.fluid.core as core
from paddle import Tensor

-from ..fluid.framework import _in_legacy_dygraph
-
__all__ = [  # noqa
    'LRScheduler',
    'NoamDecay',
@@ -1395,15 +1393,8 @@ class ReduceOnPlateau(LRScheduler):
        else:
            self.last_epoch = epoch

-        if not _in_legacy_dygraph():
-            tmp = core.eager.Tensor
-        else:
-            # need to declarate explicitly
-            from paddle.framework import VarBase as Tensor
-
-            tmp = Tensor
-
        # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
-        if isinstance(metrics, (tmp, numpy.ndarray)):
+        if isinstance(metrics, (core.eager.Tensor, numpy.ndarray)):
            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, (
                "the metrics.shape "
                "should be (1L,), but the current metrics.shape is {}. Maybe that "
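The type check kept above means ReduceOnPlateau.step() accepts a Python float, a one-element numpy array, or a 1-D Tensor of shape [1]. A minimal usage sketch, assuming the standard paddle 2.x public API:

    import paddle

    scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.1, factor=0.5, patience=2)
    linear = paddle.nn.Linear(4, 1)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

    for epoch in range(5):
        val_loss = 1.0 / (epoch + 1)   # stand-in for a real validation metric
        scheduler.step(val_loss)       # float, 1-element ndarray, or shape-[1] Tensor
        print(epoch, sgd.get_lr())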
@@ -15,8 +15,8 @@
import warnings

import paddle
-from paddle import _C_ops, _legacy_C_ops
-from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
+from paddle import _C_ops
+from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.regularizer import L2DecayRegularizer

from ..fluid import core, framework, unique_name
@@ -333,30 +333,6 @@ class Momentum(Optimizer):
            else None
        )

-        if _in_legacy_dygraph():
-            if isinstance(param_and_grad, dict):
-                self._update_regularization(param_and_grad['weight_decay'])
-            _, _, _ = _legacy_C_ops.momentum(
-                param_and_grad[0],
-                param_and_grad[1],
-                velocity_acc,
-                lr,
-                master_weight,
-                param_and_grad[0],
-                velocity_acc,
-                master_weight,
-                'mu',
-                self._momentum,
-                'use_nesterov',
-                self._use_nesterov,
-                'regularization_method',
-                regularization_method,
-                'regularization_coeff',
-                regularization_coeff,
-                'multi_precision',
-                find_master,
-            )
-            return None
-
        if in_dygraph_mode():
            if isinstance(param_and_grad, dict):
                self._update_regularization(param_and_grad['weight_decay'])
@@ -373,42 +349,42 @@ class Momentum(Optimizer):
                find_master,
                self._rescale_grad,
            )
+        else:
            attrs = {
                "mu": self._momentum,
                "use_nesterov": self._use_nesterov,
                "regularization_method": regularization_method,
                "regularization_coeff": regularization_coeff,
                "multi_precision": find_master,
                "rescale_grad": self._rescale_grad,
            }
            inputs = {
                "Param": [param_and_grad[0]],
                "Grad": [param_and_grad[1]],
                "Velocity": [velocity_acc],
                "LearningRate": [lr],
            }
            outputs = {
                "ParamOut": [param_and_grad[0]],
                "VelocityOut": [velocity_acc],
            }
            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight
            # create the momentum optimize op
            momentum_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs=attrs,
                stop_gradient=True,
            )

            return momentum_op

    def _multi_tensor_init(self, target_block, parameters, param_group_idx):
        """
@@ -553,50 +529,20 @@ class Momentum(Optimizer):
                else None
            )

-            if framework._non_static_mode():
-                if in_dygraph_mode():
-                    _, _, _ = _C_ops.merged_momentum_(
-                        self._param_dict[key][param_group_idx],
-                        grad_dict[key],
-                        self._velocity_dict[key][param_group_idx],
-                        lr_dict[key],
-                        master_weight,
-                        self._momentum,
-                        self._use_nesterov,
-                        self._regularization_method_dict[key][
-                            param_group_idx
-                        ],
-                        self._regularization_coeff_dict[key][
-                            param_group_idx
-                        ],
-                        find_master,
-                        self._rescale_grad,
-                    )
-                else:
-                    _, _, _ = _legacy_C_ops.merged_momentum(
-                        self._param_dict[key][param_group_idx],
-                        grad_dict[key],
-                        self._velocity_dict[key][param_group_idx],
-                        lr_dict[key],
-                        master_weight,
-                        self._param_dict[key][param_group_idx],
-                        self._velocity_dict[key][param_group_idx],
-                        master_weight,
-                        'mu',
-                        self._momentum,
-                        'use_nesterov',
-                        self._use_nesterov,
-                        'regularization_method',
-                        self._regularization_method_dict[key][
-                            param_group_idx
-                        ],
-                        'regularization_coeff',
-                        self._regularization_coeff_dict[key][
-                            param_group_idx
-                        ],
-                        'multi_precision',
-                        find_master,
-                    )
+            if in_dygraph_mode():
+                _, _, _ = _C_ops.merged_momentum_(
+                    self._param_dict[key][param_group_idx],
+                    grad_dict[key],
+                    self._velocity_dict[key][param_group_idx],
+                    lr_dict[key],
+                    master_weight,
+                    self._momentum,
+                    self._use_nesterov,
+                    self._regularization_method_dict[key][param_group_idx],
+                    self._regularization_coeff_dict[key][param_group_idx],
+                    find_master,
+                    self._rescale_grad,
+                )
            else:
                inputs = {
                    "Param": self._param_dict[key][param_group_idx],
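For reference, the mu / use_nesterov attributes of the momentum op implement the classic (optionally Nesterov) momentum rule; a small sketch of one step (illustrative, not Paddle code):

    def momentum_step(param, grad, velocity, lr=0.01, mu=0.9, use_nesterov=False):
        velocity = mu * velocity + grad                  # accumulate velocity
        if use_nesterov:
            param = param - lr * (grad + mu * velocity)  # look-ahead correction
        else:
            param = param - lr * velocity
        return param, velocity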
@@ -18,13 +18,12 @@ from collections import defaultdict
import numpy as np

import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid.framework import (
    Variable,
    _current_expected_place,
    _in_eager_without_dygraph_check,
-    _in_legacy_dygraph,
    default_main_program,
    device_guard,
    in_dygraph_mode,
@@ -534,17 +533,6 @@ class Optimizer:
                    current_lr.dtype,
                    place,
                )
-            elif _in_legacy_dygraph():
-                _legacy_C_ops.fill_constant(
-                    current_lr,
-                    'value',
-                    float(value),
-                    'dtype',
-                    current_lr.dtype,
-                    'shape',
-                    list(current_lr.shape),
-                )
            else:
                global_block = framework.default_main_program().global_block()
                global_block.append_op(
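This appears to be the branch behind Optimizer.set_lr: in eager mode the learning-rate tensor is overwritten in place, while in static graph a fill_constant op is appended. A short usage sketch, assuming the public set_lr / get_lr API:

    import paddle

    linear = paddle.nn.Linear(4, 1)
    opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters())

    opt.set_lr(0.05)     # overwrite the current learning rate in place
    print(opt.get_lr())  # 0.05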
@@ -1042,28 +1030,16 @@ class Optimizer:
        if self._dtype is None:
            self._dtype = loss.dtype
-        if framework._non_static_mode():
+        if framework.in_dygraph_mode():
            parameter_list = parameters if parameters else self._parameter_list
-            if framework.in_dygraph_mode():
            # It is very time-consuming to call c++ functions in a loop on the python side.
            # We put this part of the code on the c++ side to improve the speed in eager mode.
            params_grads = []
            grads = core.eager.get_all_grads(parameter_list)
            for index, grad in enumerate(grads):
                if grad is not None:
                    params_grads.append((parameter_list[index], grad))
-            else:
-                # Keep the original code to support legacy mode.
-                # Delete the else branch when the legacy mode exits.
-                params_grads = []
-                for param in parameter_list:
-                    if param.stop_gradient:
-                        continue
-                    if param._grad_ivar() is not None:
-                        # create gradient tensor
-                        grad_var = param._grad_ivar()
-                        params_grads.append((param, grad_var))
        else:
            if callbacks is None:
                callbacks = [error_clip_callback]
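The surviving eager branch collects (param, grad) pairs right after autograd has produced them; in user code this corresponds to the usual dygraph training step (standard paddle 2.x API):

    import paddle

    linear = paddle.nn.Linear(4, 1)
    opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=linear.parameters())

    x = paddle.randn([8, 4])
    loss = linear(x).mean()
    loss.backward()    # produces the gradients that Optimizer.backward() gathers
    opt.step()         # apply the update
    opt.clear_grad()   # reset gradients for the next iteration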
@@ -1207,28 +1183,26 @@ class Optimizer:
        if framework.in_dygraph_mode():
            return _C_ops.add_n([grad, regularization_term])
-        elif framework._in_legacy_dygraph():
-            return _legacy_C_ops.sum([grad, regularization_term])
+        else:
            new_grad = grad
            if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
                # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
                # the grad's type and name will be changed. But the gradient's name
                # is used in ParallelExecutor Reduce mode, so I add a flag for
                # the new_grad here.
                new_grad = grad.block.create_var(
                    name=grad.name + core.kNewGradSuffix(),
                    dtype=param.dtype,
                    shape=param.shape,
                    lod_level=param.lod_level,
                    type=core.VarDesc.VarType.LOD_TENSOR,
                )

            inputs = {"X": [grad, regularization_term]}
            outputs = {"Out": [new_grad]}
            grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)

            return new_grad

    def append_regularization_ops(
        self, parameters_and_grads, regularization=None
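The retained code adds the regularization term to the gradient (via _C_ops.add_n in eager mode, or a sum op in static graph). For L2 decay the term is proportional to the parameter itself, as in this NumPy sketch:

    import numpy as np

    def apply_l2_decay(param, grad, coeff=1e-4):
        regularization_term = coeff * param   # gradient contribution of an L2 penalty
        return grad + regularization_term     # what add_n / the sum op combine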
@@ -15,11 +15,11 @@
import warnings

import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops

from ..fluid import core, framework, unique_name
from ..fluid.dygraph import no_grad
-from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
+from ..fluid.framework import in_dygraph_mode
from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
@@ -166,42 +166,32 @@ class SGD(Optimizer):
                find_master,
            )
            return None
-        if _in_legacy_dygraph():
-            _legacy_C_ops.sgd(
-                param_and_grad[0],
-                lr,
-                param_and_grad[1],
-                master_weight,
-                param_and_grad[0],
-                master_weight,
-            )
-            return None
+        else:
            assert isinstance(block, framework.Block)
            # create the optimize op
            inputs = {
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": lr,
            }

            outputs = {"ParamOut": param_and_grad[0]}
            attrs = {"multi_precision": find_master}

            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight

            sgd_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs=attrs,
                stop_gradient=True,
            )
            return sgd_op

    def _update_param_group(self, parameters):
        parameters = parameters.get('params')
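Throughout these files, the multi_precision / MasterParam plumbing supports float16 training with an FP32 master copy: the update is applied to the FP32 weights and the FP16 copy is refreshed from them. A NumPy sketch of the idea (illustrative, not Paddle code):

    import numpy as np

    def sgd_multi_precision_step(master_fp32, grad_fp16, lr=0.01):
        # Update the FP32 master copy so tiny updates are not lost to FP16 rounding,
        # then cast back to produce the FP16 weights used in forward/backward.
        master_fp32 = master_fp32 - lr * grad_fp16.astype(np.float32)
        return master_fp32.astype(np.float16), master_fp32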