diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index 76d73d3f5fdc6fadfd630af8d4a2ca4b9f0df808..d3c16048c9079923f6dbebd2bb87770581a1591e 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -170,29 +170,29 @@ class Adadelta(Optimizer): self._epsilon, ) return None - - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - # Create the adadelta optimizer op - adadelta_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "AvgSquaredGrad": avg_squared_grad_acc, - "AvgSquaredUpdate": avg_squared_update_acc, - }, - outputs={ - "ParamOut": param_and_grad[0], - "AvgSquaredGradOut": avg_squared_grad_acc, - "AvgSquaredUpdateOut": avg_squared_update_acc, - }, - attrs={"epsilon": self._epsilon, "rho": self._rho}, - stop_gradient=True, - ) - - return adadelta_op + else: + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + # Create the adadelta optimizer op + adadelta_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "AvgSquaredGrad": avg_squared_grad_acc, + "AvgSquaredUpdate": avg_squared_update_acc, + }, + outputs={ + "ParamOut": param_and_grad[0], + "AvgSquaredGradOut": avg_squared_grad_acc, + "AvgSquaredUpdateOut": avg_squared_update_acc, + }, + attrs={"epsilon": self._epsilon, "rho": self._rho}, + stop_gradient=True, + ) + + return adadelta_op def _update_param_group(self, parameters): self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 1a517b9504baf39f93ea2d69d0615065977cd725..c5bc56769188e2f5878548deb6b72051967185b3 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -16,7 +16,7 @@ import warnings from collections import defaultdict import paddle -from paddle import _C_ops, _legacy_C_ops +from paddle import _C_ops from ..fluid import core, framework, unique_name from ..fluid.dygraph import base as imperative_base @@ -393,98 +393,55 @@ class Adam(Optimizer): ) return None - - if framework._in_legacy_dygraph(): - - _beta1 = ( - self._beta1 - if not isinstance(self._beta1, Variable) - else self._beta1.numpy().item(0) - ) - _beta2 = ( - self._beta2 - if not isinstance(self._beta2, Variable) - else self._beta2.numpy().item(0) - ) - _, _, _, _, _, _ = _legacy_C_ops.adam( - param_and_grad[0], - param_and_grad[1], - lr, - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - param_and_grad[0], - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - 'epsilon', - self._epsilon, - 'lazy_mode', - self._lazy_mode, - 'min_row_size_to_use_multithread', - 1000, - 'beta1', - _beta1, - 'beta2', - _beta2, - 'multi_precision', - find_master, - ) - - return None - - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "LearningRate": [lr], - "Moment1": [moment1], - "Moment2": [moment2], - "Beta1Pow": [beta1_pow_acc], - "Beta2Pow": [beta2_pow_acc], - } - outputs = { - "ParamOut": [param_and_grad[0]], - "Moment1Out": [moment1], - "Moment2Out": [moment2], - "Beta1PowOut": [beta1_pow_acc], - "Beta2PowOut": [beta2_pow_acc], - } - attrs = { - "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000, - "multi_precision": find_master, - } - - if isinstance(self._beta1, Variable): - inputs['Beta1Tensor'] 
= self._beta1 else: - attrs['beta1'] = self._beta1 - if isinstance(self._beta2, Variable): - inputs['Beta2Tensor'] = self._beta2 - else: - attrs['beta2'] = self._beta2 - if isinstance(self._epsilon, Variable): - inputs['EpsilonTensor'] = self._epsilon - else: - attrs['epsilon'] = self._epsilon - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - adam_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc], + } + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000, + "multi_precision": find_master, + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + if isinstance(self._epsilon, Variable): + inputs['EpsilonTensor'] = self._epsilon + else: + attrs['epsilon'] = self._epsilon + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) - return adam_op + return adam_op @imperative_base.no_grad @framework.dygraph_only @@ -729,55 +686,28 @@ class Adam(Optimizer): else self._beta2.numpy().item(0) ) - if framework._non_static_mode(): + if framework.in_dygraph_mode(): master_weight = self._master_weight_dict[key] master_weight = ( master_weight[param_group_idx] if master_weight is not None else None ) - if in_dygraph_mode(): - - _, _, _, _, _, _ = _C_ops.merged_adam_( - self._param_dict[key][param_group_idx], - grad_dict[key], - lr_dict[key], - self._moment1_dict[key][param_group_idx], - self._moment2_dict[key][param_group_idx], - self._beta1_pow_acc_dict[key][param_group_idx], - self._beta2_pow_acc_dict[key][param_group_idx], - master_weight, - _beta1, - _beta2, - self._epsilon, - find_master, - False, - ) - else: - _, _, _, _, _, _ = _legacy_C_ops.merged_adam( - self._param_dict[key][param_group_idx], - grad_dict[key], - lr_dict[key], - self._moment1_dict[key][param_group_idx], - self._moment2_dict[key][param_group_idx], - self._beta1_pow_acc_dict[key][param_group_idx], - self._beta2_pow_acc_dict[key][param_group_idx], - master_weight, - self._param_dict[key][param_group_idx], - self._moment1_dict[key][param_group_idx], - self._moment2_dict[key][param_group_idx], - self._beta1_pow_acc_dict[key][param_group_idx], - self._beta2_pow_acc_dict[key][param_group_idx], - master_weight, - 'epsilon', - self._epsilon, - 'beta1', - _beta1, - 'beta2', - _beta2, - 'multi_precision', - find_master, - ) + _, _, _, _, _, _ = _C_ops.merged_adam_( + self._param_dict[key][param_group_idx], + grad_dict[key], + lr_dict[key], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + _beta1, + _beta2, + self._epsilon, + find_master, + False, + ) else: inputs = { "Param": 
self._param_dict[key][param_group_idx], diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 69a2102ebb2e80d3ddfa2e81103044ab6083a314..f3990f62aff9da477cfe32a0733009b0ff22ae90 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle import _C_ops, _legacy_C_ops +from paddle import _C_ops from ..fluid import framework from ..fluid.dygraph import no_grad @@ -210,24 +210,6 @@ class Adamax(Optimizer): self._beta2, self._epsilon, ) - elif framework._in_legacy_dygraph(): - _legacy_C_ops.adamax( - param_and_grad[0], - param_and_grad[1], - self._create_param_lr(param_and_grad), - moment, - inf_norm, - beta1_pow_acc, - param_and_grad[0], - moment, - inf_norm, - "beta1", - self._beta1, - "beta2", - self._beta2, - "epsilon", - self._epsilon, - ) else: # create the adamax optimize op adamax_op = block.append_op( @@ -271,20 +253,20 @@ class Adamax(Optimizer): beta1_pow_acc, self._beta1, 0.0, True ) beta1_pow_acc.copy_(tmp, False) - continue - with param.block.program._optimized_guard( - [param, grad] - ), name_scope('adamax'): - beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param - ) - block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, - stop_gradient=True, - ) + else: + with param.block.program._optimized_guard( + [param, grad] + ), name_scope('adamax'): + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param + ) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True, + ) else: for param, grad in parameters_and_grads['params']: if grad is None or param.stop_gradient is True: @@ -301,24 +283,23 @@ class Adamax(Optimizer): beta1_pow_acc, self._beta1, 0.0, True ) beta1_pow_acc.copy_(tmp, False) - continue - - with param.block.program._optimized_guard( - [param, grad] - ), name_scope('adamax'): - beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param - ) - self._beta1 = parameters_and_grads.get( - 'beta1', self._default_dict['beta1'] - ) - block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, - stop_gradient=True, - ) + else: + with param.block.program._optimized_guard( + [param, grad] + ), name_scope('adamax'): + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param + ) + self._beta1 = parameters_and_grads.get( + 'beta1', self._default_dict['beta1'] + ) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True, + ) def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 71dbe345071a902131954341d29e38d2abbc42d6..ff0cb9fb841b5e10a06b675f44043a4e06a5231c 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -18,7 +18,7 @@ from collections.abc import Callable import paddle -from .. import _C_ops, _legacy_C_ops +from .. 
import _C_ops from ..fluid import core, framework, unique_name from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base @@ -473,7 +473,7 @@ class AdamW(Optimizer): lr = self._create_param_lr(param_and_grad) # create the adamw optimize op - if framework._non_static_mode(): + if framework.in_dygraph_mode(): lr_ratio_ = ( 1.0 if self._lr_ratio is None @@ -491,126 +491,90 @@ class AdamW(Optimizer): else self._beta2.numpy().item(0) ) - if framework.in_dygraph_mode(): - found_inf = self._get_auxiliary_var('found_inf') - _, _, _, _, _, _ = _C_ops.adamw_( - param_and_grad[0], - param_and_grad[1], - lr, - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - found_inf, - _beta1, - _beta2, - self._epsilon, - lr_ratio_, - self._weight_decay, - with_decay, - self._lazy_mode, - 1000, - find_master, - False, - ) - else: - _, _, _, _, _, _ = _legacy_C_ops.adamw( - param_and_grad[0], - param_and_grad[1], - lr, - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - param_and_grad[0], - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - 'epsilon', - self._epsilon, - 'lazy_mode', - self._lazy_mode, - 'min_row_size_to_use_multithread', - 1000, - 'beta1', - _beta1, - 'beta2', - _beta2, - "with_decay", - with_decay, - 'coeff', - self._weight_decay, - 'multi_precision', - find_master, - 'lr_ratio', - lr_ratio_, - ) + found_inf = self._get_auxiliary_var('found_inf') + _, _, _, _, _, _ = _C_ops.adamw_( + param_and_grad[0], + param_and_grad[1], + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + found_inf, + _beta1, + _beta2, + self._epsilon, + lr_ratio_, + self._weight_decay, + with_decay, + self._lazy_mode, + 1000, + find_master, + False, + ) return None - - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "LearningRate": [lr], - "Moment1": [moment1], - "Moment2": [moment2], - "Beta1Pow": [beta1_pow_acc], - "Beta2Pow": [beta2_pow_acc], - } - - # Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow - found_inf = self._get_auxiliary_var('found_inf') - - if found_inf: - inputs['SkipUpdate'] = found_inf - - outputs = { - "ParamOut": [param_and_grad[0]], - "Moment1Out": [moment1], - "Moment2Out": [moment2], - "Beta1PowOut": [beta1_pow_acc], - "Beta2PowOut": [beta2_pow_acc], - } - attrs = { - "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000, - "multi_precision": find_master, - "with_decay": with_decay, - "coeff": self._weight_decay, - "lr_ratio": 1.0 - if self._lr_ratio is None - else self._lr_ratio(param_and_grad[0]), - } - - if isinstance(self._beta1, Variable): - inputs['Beta1Tensor'] = self._beta1 - else: - attrs['beta1'] = self._beta1 - if isinstance(self._beta2, Variable): - inputs['Beta2Tensor'] = self._beta2 else: - attrs['beta2'] = self._beta2 - if isinstance(self._epsilon, Variable): - inputs['EpsilonTensor'] = self._epsilon - else: - attrs['epsilon'] = self._epsilon - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - adamw_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc], + } + + # Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow 
+ found_inf = self._get_auxiliary_var('found_inf') + + if found_inf: + inputs['SkipUpdate'] = found_inf + + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000, + "multi_precision": find_master, + "with_decay": with_decay, + "coeff": self._weight_decay, + "lr_ratio": 1.0 + if self._lr_ratio is None + else self._lr_ratio(param_and_grad[0]), + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + if isinstance(self._epsilon, Variable): + inputs['EpsilonTensor'] = self._epsilon + else: + attrs['epsilon'] = self._epsilon + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + adamw_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) - return adamw_op + return adamw_op def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index decb779b6280ad5a5ff59d376800b80eec8a13c2..1e959a9ce471c6c330f21db8a3347cff0ea6ff48 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle -from paddle import _C_ops, _legacy_C_ops +from paddle import _C_ops from paddle.fluid.executor import global_scope from ..fluid import core, framework, unique_name @@ -313,76 +313,48 @@ class Lamb(Optimizer): find_master, ) return None - if framework._non_static_mode(): - _legacy_C_ops.lamb( - param_and_grad[0], - param_and_grad[1], - lr, - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - param_and_grad[0], - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - 'beta1', - self._beta1, - 'beta2', - self._beta2, - 'epsilon', - self._epsilon, - 'weight_decay', - weight_decay, - 'multi_precision', - find_master, + else: + # create the lamb optimize op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr, + "Moment1": moment1, + "Moment2": moment2, + "Beta1Pow": beta1_pow_acc, + "Beta2Pow": beta2_pow_acc, + } + outputs = { + "ParamOut": param_and_grad[0], + "Moment1Out": moment1, + "Moment2Out": moment2, + "Beta1PowOut": beta1_pow_acc, + "Beta2PowOut": beta2_pow_acc, + } + attrs = { + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon, + "weight_decay": weight_decay, + "multi_precision": find_master, + } + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + if found_inf: + inputs["SkipUpdate"] = found_inf + + lamb_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, ) - return None - - # create the lamb optimize op - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr, - "Moment1": moment1, - "Moment2": moment2, - "Beta1Pow": beta1_pow_acc, - "Beta2Pow": beta2_pow_acc, - } - outputs = { - "ParamOut": param_and_grad[0], - "Moment1Out": moment1, - "Moment2Out": moment2, - "Beta1PowOut": beta1_pow_acc, - "Beta2PowOut": beta2_pow_acc, - } - attrs = { - "beta1": self._beta1, - "beta2": 
self._beta2, - "epsilon": self._epsilon, - "weight_decay": weight_decay, - "multi_precision": find_master, - } - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - if found_inf: - inputs["SkipUpdate"] = found_inf - - lamb_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - return lamb_op + return lamb_op def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 8230336d3d3be76f50a970ea53a23f8e08739ccd..258e69978a2ec6872cfea4975933d80386a2570a 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -20,8 +20,6 @@ import numpy import paddle.fluid.core as core from paddle import Tensor -from ..fluid.framework import _in_legacy_dygraph - __all__ = [ # noqa 'LRScheduler', 'NoamDecay', @@ -1395,15 +1393,8 @@ class ReduceOnPlateau(LRScheduler): else: self.last_epoch = epoch - if not _in_legacy_dygraph(): - tmp = core.eager.Tensor - else: - # need to declarate explicitly - from paddle.framework import VarBase as Tensor - - tmp = Tensor # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] - if isinstance(metrics, (tmp, numpy.ndarray)): + if isinstance(metrics, (core.eager.Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, ( "the metrics.shape " "should be (1L,), but the current metrics.shape is {}. Maybe that " diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index bf9701ce1bc87568dfb7fafacf3e3ad3aead0fbf..1c5327b7d7841d815d3ab9dfa27bc67120a9b5e8 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -15,8 +15,8 @@ import warnings import paddle -from paddle import _C_ops, _legacy_C_ops -from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from paddle import _C_ops +from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.regularizer import L2DecayRegularizer from ..fluid import core, framework, unique_name @@ -333,30 +333,6 @@ class Momentum(Optimizer): else None ) - if _in_legacy_dygraph(): - if isinstance(param_and_grad, dict): - self._update_regularization(param_and_grad['weight_decay']) - _, _, _ = _legacy_C_ops.momentum( - param_and_grad[0], - param_and_grad[1], - velocity_acc, - lr, - master_weight, - param_and_grad[0], - velocity_acc, - master_weight, - 'mu', - self._momentum, - 'use_nesterov', - self._use_nesterov, - 'regularization_method', - regularization_method, - 'regularization_coeff', - regularization_coeff, - 'multi_precision', - find_master, - ) - return None if in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) @@ -373,42 +349,42 @@ class Momentum(Optimizer): find_master, self._rescale_grad, ) + else: + attrs = { + "mu": self._momentum, + "use_nesterov": self._use_nesterov, + "regularization_method": regularization_method, + "regularization_coeff": regularization_coeff, + "multi_precision": find_master, + "rescale_grad": self._rescale_grad, + } + + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "Velocity": [velocity_acc], + "LearningRate": [lr], + } + + outputs = { + "ParamOut": [param_and_grad[0]], + "VelocityOut": [velocity_acc], + } + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + # create the momentum 
optimize op + momentum_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) - attrs = { - "mu": self._momentum, - "use_nesterov": self._use_nesterov, - "regularization_method": regularization_method, - "regularization_coeff": regularization_coeff, - "multi_precision": find_master, - "rescale_grad": self._rescale_grad, - } - - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "Velocity": [velocity_acc], - "LearningRate": [lr], - } - - outputs = { - "ParamOut": [param_and_grad[0]], - "VelocityOut": [velocity_acc], - } - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - # create the momentum optimize op - momentum_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return momentum_op + return momentum_op def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ @@ -553,50 +529,20 @@ class Momentum(Optimizer): else None ) - if framework._non_static_mode(): - if in_dygraph_mode(): - _, _, _ = _C_ops.merged_momentum_( - self._param_dict[key][param_group_idx], - grad_dict[key], - self._velocity_dict[key][param_group_idx], - lr_dict[key], - master_weight, - self._momentum, - self._use_nesterov, - self._regularization_method_dict[key][ - param_group_idx - ], - self._regularization_coeff_dict[key][ - param_group_idx - ], - find_master, - self._rescale_grad, - ) - else: - _, _, _ = _legacy_C_ops.merged_momentum( - self._param_dict[key][param_group_idx], - grad_dict[key], - self._velocity_dict[key][param_group_idx], - lr_dict[key], - master_weight, - self._param_dict[key][param_group_idx], - self._velocity_dict[key][param_group_idx], - master_weight, - 'mu', - self._momentum, - 'use_nesterov', - self._use_nesterov, - 'regularization_method', - self._regularization_method_dict[key][ - param_group_idx - ], - 'regularization_coeff', - self._regularization_coeff_dict[key][ - param_group_idx - ], - 'multi_precision', - find_master, - ) + if in_dygraph_mode(): + _, _, _ = _C_ops.merged_momentum_( + self._param_dict[key][param_group_idx], + grad_dict[key], + self._velocity_dict[key][param_group_idx], + lr_dict[key], + master_weight, + self._momentum, + self._use_nesterov, + self._regularization_method_dict[key][param_group_idx], + self._regularization_coeff_dict[key][param_group_idx], + find_master, + self._rescale_grad, + ) else: inputs = { "Param": self._param_dict[key][param_group_idx], diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index d0ed4105f8adb91877f7ec180ed0f681fd0d2b5e..5d53593c2e039cb8be8a18ffc826bdb763737f79 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -18,13 +18,12 @@ from collections import defaultdict import numpy as np import paddle -from paddle import _C_ops, _legacy_C_ops +from paddle import _C_ops from paddle.fluid import core from paddle.fluid.framework import ( Variable, _current_expected_place, _in_eager_without_dygraph_check, - _in_legacy_dygraph, default_main_program, device_guard, in_dygraph_mode, @@ -534,17 +533,6 @@ class Optimizer: current_lr.dtype, place, ) - - elif _in_legacy_dygraph(): - _legacy_C_ops.fill_constant( - current_lr, - 'value', - float(value), - 'dtype', - current_lr.dtype, - 'shape', - list(current_lr.shape), - ) else: global_block = framework.default_main_program().global_block() global_block.append_op( @@ -1042,28 +1030,16 @@ class 
Optimizer: if self._dtype is None: self._dtype = loss.dtype - if framework._non_static_mode(): + if framework.in_dygraph_mode(): parameter_list = parameters if parameters else self._parameter_list - if framework.in_dygraph_mode(): - # It is very time-consuming to call c++ functions in a loop on the python side. - # We put this part of the code on the c++ side to improve the speed in eager mode. - params_grads = [] - grads = core.eager.get_all_grads(parameter_list) - for index, grad in enumerate(grads): - if grad is not None: - params_grads.append((parameter_list[index], grad)) - else: - # Keep the original code to support legacy mode. - # Delete the else branch when the legacy mode exits. - params_grads = [] - for param in parameter_list: - if param.stop_gradient: - continue - if param._grad_ivar() is not None: - # create gradient tensor - grad_var = param._grad_ivar() - params_grads.append((param, grad_var)) + # It is very time-consuming to call c++ functions in a loop on the python side. + # We put this part of the code on the c++ side to improve the speed in eager mode. + params_grads = [] + grads = core.eager.get_all_grads(parameter_list) + for index, grad in enumerate(grads): + if grad is not None: + params_grads.append((parameter_list[index], grad)) else: if callbacks is None: callbacks = [error_clip_callback] @@ -1207,28 +1183,26 @@ class Optimizer: if framework.in_dygraph_mode(): return _C_ops.add_n([grad, regularization_term]) - elif framework._in_legacy_dygraph(): - return _legacy_C_ops.sum([grad, regularization_term]) - - new_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, - # the grad's type and name will be changed. But the gradient's name - # is used in ParallelExecutor Reduce mode, so I add a flag for - # the new_grad here. - new_grad = grad.block.create_var( - name=grad.name + core.kNewGradSuffix(), - dtype=param.dtype, - shape=param.shape, - lod_level=param.lod_level, - type=core.VarDesc.VarType.LOD_TENSOR, - ) + else: + new_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, + # the grad's type and name will be changed. But the gradient's name + # is used in ParallelExecutor Reduce mode, so I add a flag for + # the new_grad here. 
+ new_grad = grad.block.create_var( + name=grad.name + core.kNewGradSuffix(), + dtype=param.dtype, + shape=param.shape, + lod_level=param.lod_level, + type=core.VarDesc.VarType.LOD_TENSOR, + ) - inputs = {"X": [grad, regularization_term]} - outputs = {"Out": [new_grad]} - grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) + inputs = {"X": [grad, regularization_term]} + outputs = {"Out": [new_grad]} + grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) - return new_grad + return new_grad def append_regularization_ops( self, parameters_and_grads, regularization=None diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 7605baf4e3dd8442043831c98165d4aafdb56eb2..db85080834ccd6623351300f6432a9e1bb39152e 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -15,11 +15,11 @@ import warnings import paddle -from paddle import _C_ops, _legacy_C_ops +from paddle import _C_ops from ..fluid import core, framework, unique_name from ..fluid.dygraph import no_grad -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from ..fluid.framework import in_dygraph_mode from ..fluid.layer_helper import LayerHelper from .optimizer import Optimizer @@ -166,42 +166,32 @@ class SGD(Optimizer): find_master, ) return None - if _in_legacy_dygraph(): - _legacy_C_ops.sgd( - param_and_grad[0], - lr, - param_and_grad[1], - master_weight, - param_and_grad[0], - master_weight, + else: + assert isinstance(block, framework.Block) + # create the optimize op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr, + } + + outputs = {"ParamOut": param_and_grad[0]} + + attrs = {"multi_precision": find_master} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + sgd_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, ) - return None - - assert isinstance(block, framework.Block) - # create the optimize op - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr, - } - - outputs = {"ParamOut": param_and_grad[0]} - - attrs = {"multi_precision": find_master} - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - sgd_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - return sgd_op + return sgd_op def _update_param_group(self, parameters): parameters = parameters.get('params')
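
Note on the refactor above: with the _legacy_C_ops imports and the _in_legacy_dygraph() / _non_static_mode() branches removed, each optimizer keeps two code paths, the eager _C_ops kernels when in_dygraph_mode() is true and the block.append_op(...) construction otherwise. A minimal dygraph smoke test along the following lines exercises the retained eager path for Adam; the model, shapes, and learning rate are arbitrary placeholders for illustration, not part of this patch:

    import paddle

    # Tiny model and one forward/backward pass; any differentiable graph works.
    linear = paddle.nn.Linear(10, 10)
    loss = paddle.mean(linear(paddle.rand([4, 10], dtype="float32")))

    # In eager (dygraph) mode, step() dispatches to the _C_ops.adam_ kernel.
    opt = paddle.optimizer.Adam(learning_rate=0.01, parameters=linear.parameters())

    loss.backward()
    opt.step()        # applies the Adam update through the eager kernel
    opt.clear_grad()  # resets gradients for the next iteration

The static-graph block.append_op branches are functionally unchanged, only re-indented under else:, so existing program-construction tests continue to cover them.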