From 94365855d28b2139603611e421c98509e20f2a4c Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Tue, 11 Jul 2023 18:12:32 +0800 Subject: [PATCH] =?UTF-8?q?replace=20the=20AdagradOptimizer=20=E3=80=81ada?= =?UTF-8?q?maxOptimizer=E3=80=81AdadeltaOptimizer=E3=80=81RMSPropOptimizer?= =?UTF-8?q?=E3=80=81LambOptimizer=20and=20Momentum=20(#54152)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * replace the AdadeltaOptimizer with Adadelta * replace the RMSPropOptimizer with RMSProp * replace the LambOptimizer with lamb * replace the momentum in contrib/optimizer.py with Momentum in python/paddle/optimizer/momentum.py * fix bug * fix bug * fix bug * fix bug of Lamp * fix bug of Lamp * fix bug of import * replace the AdamaxOptimizer with Admax and change the optimizer base for AdagradOptimizer * fix bug * fix bug * Update optimizer.py * fix bug * fix bug --- .../fleet/meta_optimizers/lamb_optimizer.py | 9 +- python/paddle/fluid/__init__.py | 2 - python/paddle/fluid/contrib/__init__.py | 22 - python/paddle/fluid/contrib/optimizer.py | 287 ---- python/paddle/fluid/optimizer.py | 1244 ++--------------- python/setup.py.in | 1 - setup.py | 1 - test/amp/test_amp_api.py | 2 +- test/book/notest_understand_sentiment.py | 2 +- .../contrib/test_image_classification_fp16.py | 4 +- .../test_reinforcement_learning.py | 4 +- test/dygraph_to_static/test_sentiment.py | 4 +- test/legacy_test/fleet_ps_training.py | 2 +- test/legacy_test/test_adadelta_op.py | 113 -- test/legacy_test/test_adagrad_op.py | 111 -- test/legacy_test/test_adamax_op.py | 109 -- test/legacy_test/test_case.py | 2 +- .../test_eager_deletion_dynamic_rnn_base.py | 2 +- test/legacy_test/test_imperative_optimizer.py | 35 +- .../test_imperative_optimizer_v2.py | 28 +- test/legacy_test/test_momentum_op.py | 14 +- test/legacy_test/test_optimizer.py | 137 -- test/legacy_test/test_regularizer.py | 6 +- test/legacy_test/test_regularizer_api.py | 6 +- test/legacy_test/test_rmsprop_op.py | 113 -- test/legacy_test/test_trainable.py | 4 +- test/legacy_test/test_weight_decay.py | 2 +- 27 files changed, 155 insertions(+), 2111 deletions(-) delete mode 100644 python/paddle/fluid/contrib/__init__.py delete mode 100644 python/paddle/fluid/contrib/optimizer.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 1a8c491fe48..9083c659bfc 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -13,8 +13,8 @@ import logging +import paddle from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.optimizer import LambOptimizer as LAMB from .meta_optimizer_base import MetaOptimizerBase @@ -55,14 +55,13 @@ class LambOptimizer(MetaOptimizerBase): _exclude_from_weight_decay_fn = exclude_fn - self.lamb_opt = LAMB( + self.lamb_opt = paddle.optimizer.Lamb( learning_rate=opt._learning_rate, lamb_weight_decay=configs['lamb_weight_decay'], beta1=opt._beta1, beta2=opt._beta2, epsilon=opt._epsilon, - parameter_list=opt._parameter_list, - regularization=opt.regularization, + parameters=opt._parameter_list, grad_clip=opt._grad_clip, exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn, name=opt._name, @@ -111,7 +110,7 @@ class LambOptimizer(MetaOptimizerBase): return self.lamb_opt.apply_gradients(params_grads=params_grads) def apply_optimize(self, loss, startup_program, params_grads): - return 
self.lamb_opt.apply_optimize( + return self.lamb_opt._apply_optimize( loss, startup_program=startup_program, params_grads=params_grads ) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 646ae72f6c2..d677d4e8d70 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -53,7 +53,6 @@ from . import initializer from .initializer import set_global_initializer from . import layers from . import dygraph -from . import contrib from . import optimizer from . import backward from .backward import gradients @@ -105,7 +104,6 @@ __all__ = ( 'io', 'initializer', 'layers', - 'contrib', 'dygraph', 'enable_dygraph', 'disable_dygraph', diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py deleted file mode 100644 index c1f884fdb4a..00000000000 --- a/python/paddle/fluid/contrib/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from . import optimizer -from .optimizer import * - -__all__ = [] - -__all__ += optimizer.__all__ diff --git a/python/paddle/fluid/contrib/optimizer.py b/python/paddle/fluid/contrib/optimizer.py deleted file mode 100644 index e97b9d8c49c..00000000000 --- a/python/paddle/fluid/contrib/optimizer.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.fluid.optimizer import Optimizer -from paddle.regularizer import L1Decay -from paddle.regularizer import L2Decay -from paddle.fluid import core -from paddle.fluid import framework -from paddle.fluid.framework import program_guard -from paddle.fluid import unique_name -from paddle.fluid import layers -from paddle.fluid.layer_helper import LayerHelper -import warnings -from paddle import _C_ops, _legacy_C_ops - -__all__ = ['Momentum'] - - -class Momentum(Optimizer): - r""" - - Simple Momentum optimizer with velocity state - - This optimizer has a flag for Nestrov Momentum. - - The update equations are as follows: - - .. math:: - - & velocity = mu * velocity + gradient - - & if (use\_nesterov): - - &\quad param = param - (gradient + mu * velocity) * learning\_rate - - & else: - - &\quad param = param - learning\_rate * velocity - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. 
\ - Can be a float value or a Variable with one float value as data element. - momentum (float): Momentum factor - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - use_nesterov (bool, optional): Enables Nesterov momentum, default is false. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. - rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \ - Often choose to be ``1.0/batch_size``. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[1, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[1], dtype='float32') - linear = paddle.nn.Linear(13, 1) - y_predict = linear(x) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9) - moment_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(paddle.static.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - """ - _velocity_acc_str = "velocity" - - def __init__( - self, - learning_rate, - momentum, - parameter_list=None, - use_nesterov=False, - regularization=None, - grad_clip=None, - multi_precision=False, - rescale_grad=1.0, - name=None, - ): - assert learning_rate is not None - assert momentum is not None - predicate = lambda regular: isinstance(regular, L2Decay) - py_regular = None if predicate(regularization) else regularization - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=py_regular, - grad_clip=grad_clip, - name=name, - ) - self.type = "momentum" - self._momentum = momentum - self._use_nesterov = bool(use_nesterov) - self._regularization_method = "" - self._regularization_coeff = 0 - if isinstance(regularization, L2Decay): - self._regularization_method = "l2_decay" - 
self._regularization_coeff = regularization._coeff - self._multi_precision = multi_precision - self._rescale_grad = rescale_grad - self._master_weights = {} - - def _create_master_weight(self, param): - assert isinstance(self.helper, LayerHelper) - - var_name = param.name + "_fp32_master" - var_name = unique_name.generate(var_name) - var = paddle.static.create_global_var( - name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True, - ) - block = self.helper.startup_program.global_block() - block.append_op( - type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32, - }, - ) - self._master_weights[param.name] = var - return var - - def _get_accumulator(self, name, param): - """Utility function to fetch an accumulator for a parameter - - Args: - name: name of the accumulator - param: parameter variable for which accumulator is to be fetched - - Returns: - accumulator variable for the parameter - """ - if self._name is not None: - name = self._name + "_" + name - find_master = ( - self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 - ) - target_param = ( - self._master_weights[param.name] if find_master else param - ) - target_name = target_param.name - if ( - name not in self._accumulators - or target_name not in self._accumulators[name] - ): - raise Exception( - "Accumulator {} does not exist for parameter {}".format( - name, target_name - ) - ) - return self._accumulators[name][target_name] - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: - master_p = self._create_master_weight(p) - self._add_accumulator(self._velocity_acc_str, master_p) - continue - if ( - p.dtype == core.VarDesc.VarType.FP16 - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Momentum optimizer." 
- ) - self._add_accumulator(self._velocity_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - velocity_acc = self._get_accumulator( - self._velocity_acc_str, param_and_grad[0] - ) - lr = self._create_param_lr(param_and_grad) - - find_master = ( - self._multi_precision - and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None - ) - - if framework.in_dygraph_mode(): - _, _, _ = _legacy_C_ops.momentum( - param_and_grad[0], - param_and_grad[1], - velocity_acc, - lr, - master_weight, - param_and_grad[0], - velocity_acc, - master_weight, - 'mu', - self._momentum, - 'use_nesterov', - self._use_nesterov, - 'regularization_method', - self._regularization_method, - 'regularization_coeff', - self._regularization_coeff, - 'multi_precision', - find_master, - ) - return None - - attrs = { - "mu": self._momentum, - "use_nesterov": self._use_nesterov, - "regularization_method": self._regularization_method, - "regularization_coeff": self._regularization_coeff, - "multi_precision": find_master, - "rescale_grad": self._rescale_grad, - } - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "Velocity": [velocity_acc], - "LearningRate": [lr], - } - outputs = { - "ParamOut": [param_and_grad[0]], - "VelocityOut": [velocity_acc], - } - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - # create the momentum optimize op - momentum_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return momentum_op diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 113170ae59d..b46f3da08cb 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -62,27 +62,19 @@ from ..fluid.framework import ( __all__ = [ 'SGD', 'Momentum', - 'Adagrad', 'Adam', - 'Adamax', 'Dpsgd', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', - 'AdagradOptimizer', 'AdamOptimizer', - 'AdamaxOptimizer', 'DpsgdOptimizer', 'DecayedAdagradOptimizer', - 'RMSPropOptimizer', 'FtrlOptimizer', - 'Adadelta', - 'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum', 'LarsMomentumOptimizer', - 'LambOptimizer', 'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer', @@ -1983,181 +1975,6 @@ class LarsMomentumOptimizer(Optimizer): return momentum_op -class AdagradOptimizer(Optimizer): - r""" - The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign - different learning rates to individual parameters. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - moment\_out &= moment + grad * grad - - param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - - Related paper: `Adaptive Subgradient Methods for Online Learning and - Stochastic Optimization `_. - - The original paper does not have the ``epsilon`` attribute. It is added here - in our implementation as also proposed `Per-parameter adaptive learning rate - methods `_ - for numerical stability to avoid the division by zero error. - - Args: - learning_rate (float|Variable): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-06. 
- parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - initial_accumulator_value (float, optional): Initial value for moment accumulator. - The default value is 0.0. - - Examples: - .. code-block:: python - - import paddle - import numpy as np - import paddle.fluid as fluid - - paddle.enable_static() - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = paddle.static.data(name="inp", shape=[2, 2], dtype="float32") - out = paddle.static.nn.fc(inp, size=3) - out = paddle.sum(out) - optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.2) - optimizer.minimize(out) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - exe.run( - feed={"inp": np_inp}, - fetch_list=[out.name]) - """ - _moment_acc_str = "moment" - - def __init__( - self, - learning_rate, - epsilon=1.0e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - initial_accumulator_value=0.0, - ): - assert learning_rate is not None - assert epsilon is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "adagrad" - self._multi_precision = False - self._epsilon = epsilon - self.initial_accumulator_value = initial_accumulator_value - self._master_weights = {} - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): - master_p = self._create_master_weight(p) - self._add_accumulator( - self._moment_acc_str, - master_p, - fill_value=self.initial_accumulator_value, - ) - continue - if ( - self._is_dtype_fp16_or_bf16(p.dtype) - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Lars optimizer." 
- ) - self._add_accumulator( - self._moment_acc_str, - p, - fill_value=self.initial_accumulator_value, - ) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment_acc = self._get_accumulator_master( - self._moment_acc_str, param_and_grad[0] - ) - - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param_and_grad[0].dtype - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None - ) - - if in_dygraph_mode(): - _C_ops.adagrad_( - param_and_grad[0], - param_and_grad[1], - moment_acc, - self._create_param_lr(param_and_grad), - master_weight, - self._epsilon, - find_master, - ) - return None - else: - # Create the adagrad optimizer op - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad), - } - outputs = { - "ParamOut": param_and_grad[0], - "MomentOut": moment_acc, - } - - attrs = {"epsilon": self._epsilon, "multi_precision": find_master} - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - adagrad_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return adagrad_op - - class AdamOptimizer(Optimizer): r""" The Adam optimizer uses an optimization described at the end @@ -2586,59 +2403,10 @@ class AdamOptimizer(Optimizer): ) -class AdamaxOptimizer(Optimizer): +class DpsgdOptimizer(Optimizer): r""" - The Adamax optimizer is implemented based on the Adamax Optimization - in Section 7 of `Adam paper `_. - The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, - which makes the learning rate update algorithm more stable and simple. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - t & = t + 1 - - moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad - - inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) - - learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} - - param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} - - Related paper: `Adam: A Method for Stochastic Optimization `_ - - The original paper does not have an ``epsilon`` attribute, - it is added here for numerical stability to prevent the division by 0 error. - - Args: - learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. The default value is 0.001. - beta1 (float, optional): The exponential decay rate for the 1st moment estimates. - The default value is 0.9. - beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. - The default value is 0.999. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-08. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. 
Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.** + We implement the Dpsgd optimizer according to CCS16 paper - + Deep Learning with Differential Privacy. Examples: .. code-block:: python @@ -2655,11 +2423,11 @@ class AdamaxOptimizer(Optimizer): train_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(train_program, startup_program): - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + data = paddle.static.data(name='X', shape=[-1,1], dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) - adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2) - adam.minimize(loss) + optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) + optimizer.minimize(loss) # Run the startup program once and only once. exe.run(startup_program) @@ -2668,240 +2436,46 @@ class AdamaxOptimizer(Optimizer): outs = exe.run(program=train_program, feed={'X': x}, fetch_list=[loss.name]) + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + clip (float): clipping threshold + batch_size (float): batch size. + sigma (float): for gaussian noise. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static graph mode, at this time all parameters will be updated. + Notes: + Currently, DpsgdOptimizer doesn't support sparse parameter optimization. 
""" - _moment_acc_str = "moment" - _inf_norm_acc_str = "inf_norm" - _beta1_pow_acc_str = "beta1_pow_acc" def __init__( self, learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, + clip=0.9, + batch_size=0.999, + sigma=1e-8, parameter_list=None, - regularization=None, - grad_clip=None, - name=None, ): assert learning_rate is not None - assert beta1 is not None - assert beta2 is not None - assert epsilon is not None + assert clip is not None + assert batch_size is not None + assert sigma is not None super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, + learning_rate=learning_rate, parameter_list=parameter_list ) - self.type = "adamax" - self._beta1 = beta1 - self._beta2 = beta2 - self._epsilon = epsilon - self._multi_precision = False - self._master_weights = {} - - def _create_accumulators(self, block, parameters): - # Create accumulator tensors for first moment and infinity norm - for p in parameters: - if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): - master_p = self._create_master_weight(p) - self._add_accumulator(self._moment_acc_str, master_p) - self._add_accumulator(self._inf_norm_acc_str, master_p) - self._add_accumulator( - name=self._beta1_pow_acc_str, - param=master_p, - fill_value=self._beta1, - shape=[1], - ) - continue - if ( - self._is_dtype_fp16_or_bf16(p.dtype) - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Lars optimizer." - ) - self._add_accumulator(self._moment_acc_str, p) - self._add_accumulator(self._inf_norm_acc_str, p) - self._add_accumulator( - name=self._beta1_pow_acc_str, - param=p, - fill_value=self._beta1, - shape=[1], - ) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment = self._get_accumulator_master( - self._moment_acc_str, param_and_grad[0] - ) - inf_norm = self._get_accumulator_master( - self._inf_norm_acc_str, param_and_grad[0] - ) - beta1_pow_acc = self._get_accumulator_master( - self._beta1_pow_acc_str, param_and_grad[0] - ) - - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param_and_grad[0].dtype - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None - ) - if in_dygraph_mode(): - _C_ops.adamax_( - param_and_grad[0], - param_and_grad[1], - self._create_param_lr(param_and_grad), - moment, - inf_norm, - beta1_pow_acc, - master_weight, - self._beta1, - self._beta2, - self._epsilon, - find_master, - ) - else: - # create the adamax optimize op - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - "Moment": moment, - "InfNorm": inf_norm, - "Beta1Pow": beta1_pow_acc, - } - outputs = { - "ParamOut": param_and_grad[0], - "MomentOut": moment, - "InfNormOut": inf_norm, - } - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - attrs = { - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - "multi_precision": find_master, - } - - adamax_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return adamax_op - - def _finish_update(self, block, parameters_and_grads): - """Update Beta1 Power accumulator""" - assert isinstance(block, framework.Block) 
- for param, grad in parameters_and_grads: - if grad is None or param.trainable is False: - continue - with param.block.program._optimized_guard( - [param, grad] - ), name_scope('adamx'): - beta1_pow_acc = self._get_accumulator_master( - self._beta1_pow_acc_str, param - ) - if in_dygraph_mode(): - tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, True) - beta1_pow_acc.copy_(tmp, False) - else: - block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, - stop_gradient=True, - ) - - -class DpsgdOptimizer(Optimizer): - r""" - We implement the Dpsgd optimizer according to CCS16 paper - - Deep Learning with Differential Privacy. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - import paddle - paddle.enable_static() - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = paddle.static.data(name='X', shape=[-1,1], dtype='float32') - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) - optimizer.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - clip (float): clipping threshold - batch_size (float): batch size. - sigma (float): for gaussian noise. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - Notes: - Currently, DpsgdOptimizer doesn't support sparse parameter optimization. - """ - - def __init__( - self, - learning_rate=0.001, - clip=0.9, - batch_size=0.999, - sigma=1e-8, - parameter_list=None, - ): - assert learning_rate is not None - assert clip is not None - assert batch_size is not None - assert sigma is not None - super().__init__( - learning_rate=learning_rate, parameter_list=parameter_list - ) - self.type = "dpsgd" - self._clip = clip - self._batch_size = batch_size - self._sigma = sigma - ''' - Note(wangzhongpu): - This property is only used for debugging, do not need to set it! - Dpsgd operator use time(NULL) as random seed to generate random number. - However, during debugging, we need determinated result, so we will set self._seed to a fixed number. - ''' - self._seed = None + self.type = "dpsgd" + self._clip = clip + self._batch_size = batch_size + self._sigma = sigma + ''' + Note(wangzhongpu): + This property is only used for debugging, do not need to set it! + Dpsgd operator use time(NULL) as random seed to generate random number. + However, during debugging, we need determinated result, so we will set self._seed to a fixed number. 
+ ''' + self._seed = None def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -2949,371 +2523,29 @@ class DpsgdOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer): r""" The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces - the decay rate to solve the problem of a sharp drop in the learning rate - during model training when using the AdagradOptimizer. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - moment\_out & = decay * moment + (1 - decay) * grad * grad - - param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - - Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic - Optimization `_. - - The original paper does not have an ``epsilon`` attribute. It is added here for numerical - stability to avoid the division by zero error. - - Args: - learning_rate (float|Variable): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. - decay (float, optional): The decay rate. The default value is 0.95. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-06. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.** - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - x = paddle.static.data(name='x', shape=[None, 10], dtype='float32') - trans = paddle.static.nn.fc(x, 100) - cost = paddle.mean(trans) - optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2) - optimizer.minimize(cost) - """ - _moment_acc_str = "moment" - - def __init__( - self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - ): - assert learning_rate is not None - assert decay is not None - assert epsilon is not None - - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "decayed_adagrad" - self._decay = decay - self._epsilon = epsilon - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator(self._moment_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment_acc = self._get_accumulator( - self._moment_acc_str, param_and_grad[0] - ) - - if in_dygraph_mode(): - _legacy_C_ops.decayed_adagrad( - param_and_grad[0], - param_and_grad[1], - moment_acc, - self._create_param_lr(param_and_grad), - param_and_grad[0], - moment_acc, - "epsilon", - self._epsilon, - "decay", - self._decay, - ) - else: - # Create the decayed adagrad optimizer op - decayed_adagrad_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment_acc, - }, - attrs={"epsilon": self._epsilon, "decay": self._decay}, - stop_gradient=True, - ) - - return decayed_adagrad_op - - -class AdadeltaOptimizer(Optimizer): - r""" - **Notes: This API does not support sparse parameter optimization.** - - Adadelta Optimizer. Please refer to this for details: - `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD `_. - - The update is done as follows: - - .. math:: - - E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 - - learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) } - - E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2 - - Args: - learning_rate (float|Variable): global learning rate. - epsilon (float): a small float number for numeric stability. Default 1.0e-6. - rho (float): a floating point value indicating the decay rate. Default 0.95. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . 
There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): The default value is None. Normally there is no need for user - to set this property. For more information, please refer to - :ref:`api_guide_Name` . - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - image = paddle.static.data(name='image', shape=[None, 28], dtype='float32') - fc = paddle.static.nn.fc(image, size=10) - cost = paddle.mean(fc) - optimizer = fluid.optimizer.Adadelta( - learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) - - # optimizer_ops is a list of optimizer operators to update parameters - # params_grads is a list of (param, param_grad), where param is each - # parameter and param_grad is the gradient variable of param. - optimizer_ops, params_grads = optimizer.minimize(cost) - """ - - _avg_squared_grad_acc_str = "_avg_squared_grad" - _avg_squared_update_acc_str = "_avg_squared_update" - - def __init__( - self, - learning_rate, - epsilon=1.0e-6, - rho=0.95, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - ): - if learning_rate is None: - raise ValueError("learning_rate is not set.") - if epsilon is None: - raise ValueError("epsilon is not set.") - if rho is None: - raise ValueError("rho is not set.") - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "adadelta" - self._multi_precision = False - self._master_weights = {} - self._epsilon = epsilon - self._rho = rho - - def _create_accumulators(self, block, parameters): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - for p in parameters: - if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): - master_p = self._create_master_weight(p) - self._add_accumulator(self._avg_squared_grad_acc_str, master_p) - self._add_accumulator( - self._avg_squared_update_acc_str, master_p - ) - continue - if ( - self._is_dtype_fp16_or_bf16(p.dtype) - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Lars optimizer." 
- ) - self._add_accumulator(self._avg_squared_grad_acc_str, p) - self._add_accumulator(self._avg_squared_update_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - avg_squared_grad_acc = self._get_accumulator_master( - self._avg_squared_grad_acc_str, param_and_grad[0] - ) - avg_squared_update_acc = self._get_accumulator_master( - self._avg_squared_update_acc_str, param_and_grad[0] - ) - - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param_and_grad[0].dtype - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None - ) - - if in_dygraph_mode(): - _C_ops.adadelta_( - param_and_grad[0], - param_and_grad[1], - avg_squared_grad_acc, - avg_squared_update_acc, - self._create_param_lr(param_and_grad), - master_weight, - self._rho, - self._epsilon, - find_master, - ) - else: - # Create the adadelta optimizer op - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "AvgSquaredGrad": avg_squared_grad_acc, - "AvgSquaredUpdate": avg_squared_update_acc, - "LearningRate": self._create_param_lr(param_and_grad), - } - outputs = { - "ParamOut": param_and_grad[0], - "AvgSquaredGradOut": avg_squared_grad_acc, - "AvgSquaredUpdateOut": avg_squared_update_acc, - } - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - adadelta_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs={ - "epsilon": self._epsilon, - "rho": self._rho, - "multi_precision": find_master, - }, - stop_gradient=True, - ) - - return adadelta_op - - -class RMSPropOptimizer(Optimizer): - r""" - Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning - rate method. The original slides proposed RMSProp: Slide 29 of - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . - - The original equation is as follows: - - .. math:: - - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - - w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) - - The first equation calculates moving average of the squared gradient for - each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. - - In some cases, adding a momentum term :math: `\\beta` is beneficial. - In our implementation, Nesterov momentum is used: - - .. math:: - - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + - \\epsilon}} \\nabla Q_{i}(w) - - w & = w - v(w, t) - - if centered is True: - - .. math:: + the decay rate to solve the problem of a sharp drop in the learning rate + during model training when using the AdagradOptimizer. - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + The parameter ``param_out`` update rule with gradient ``grad``: - g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + .. math:: - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + - \\epsilon}} \\nabla Q_{i}(w) + moment\_out & = decay * moment + (1 - decay) * grad * grad - w & = w - v(w, t) + param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 - and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a - smoothing term to avoid division by zero, usually set somewhere in range - from 1e-4 to 1e-8. 
+ Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization `_. + The original paper does not have an ``epsilon`` attribute. It is added here for numerical + stability to avoid the division by zero error. - Parameters: - learning_rate(float): Global learning rate. - rho(float): rho is :math: `\\rho` in equation, default is 0.95. - epsilon(float): :math: `\\epsilon` in equation is smoothing term to - avoid division by zero, default is 1e-6. - momentum(float): :math:`\\beta` in equation is the momentum term, - default is 0.0. - centered(bool): If True, gradients are normalized by the estimated variance of - the gradient; if False, by the uncentered second moment. Setting this to - True may help with training, but is slightly more expensive in terms of - computation and memory. Defaults to False. + Args: + learning_rate (float|Variable): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. + decay (float, optional): The decay rate. The default value is 0.95. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-06. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static graph mode, at this time all parameters will be updated. @@ -3326,59 +2558,42 @@ class RMSPropOptimizer(Optimizer): some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. - Raises: - ValueError: If learning_rate, rho, epsilon, momentum are None. + **Notes**: + **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.** Examples: - .. code-block:: python + .. 
code-block:: python import paddle import paddle.fluid as fluid - import numpy as np paddle.enable_static() - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) - rms_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - + x = paddle.static.data(name='x', shape=[None, 10], dtype='float32') + trans = paddle.static.nn.fc(x, 100) + cost = paddle.mean(trans) + optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2) + optimizer.minimize(cost) """ - - _momentum_acc_str = "momentum" - _mean_square_acc_str = "mean_square" - _mean_grad_acc_str = "mean_grad" + _moment_acc_str = "moment" def __init__( self, learning_rate, - rho=0.95, + decay=0.95, epsilon=1.0e-6, - momentum=0.0, - centered=False, parameter_list=None, regularization=None, grad_clip=None, name=None, ): + assert learning_rate is not None + assert decay is not None + assert epsilon is not None + super().__init__( learning_rate=learning_rate, parameter_list=parameter_list, @@ -3386,119 +2601,55 @@ class RMSPropOptimizer(Optimizer): grad_clip=grad_clip, name=name, ) - if learning_rate is None: - raise ValueError("learning_rate is not set.") - if rho is None: - raise ValueError("rho is not set.") - if epsilon is None: - raise ValueError("epsilon is not set.") - if momentum is None: - raise ValueError("momentum is not set.") - - self.type = "rmsprop" - self._rho = rho + self.type = "decayed_adagrad" + self._decay = decay self._epsilon = epsilon - self._momentum = momentum - self._centered = centered - self._multi_precision = False - self._master_weights = {} def _create_accumulators(self, block, parameters): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") + assert isinstance(block, framework.Block) for p in parameters: - if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): - master_p = self._create_master_weight(p) - self._add_accumulator(self._momentum_acc_str, master_p) - self._add_accumulator(self._mean_square_acc_str, master_p) - self._add_accumulator(self._mean_grad_acc_str, master_p) - continue - if ( - self._is_dtype_fp16_or_bf16(p.dtype) - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Lars optimizer." 
- ) - self._add_accumulator(self._momentum_acc_str, p) - self._add_accumulator(self._mean_square_acc_str, p) - self._add_accumulator(self._mean_grad_acc_str, p) + self._add_accumulator(self._moment_acc_str, p) def _append_optimize_op(self, block, param_and_grad): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") + assert isinstance(block, framework.Block) - momentum_acc = self._get_accumulator_master( - self._momentum_acc_str, param_and_grad[0] - ) - mean_square_acc = self._get_accumulator_master( - self._mean_square_acc_str, param_and_grad[0] - ) - mean_grad_acc = self._get_accumulator_master( - self._mean_grad_acc_str, param_and_grad[0] - ) - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param_and_grad[0].dtype - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None + moment_acc = self._get_accumulator( + self._moment_acc_str, param_and_grad[0] ) + if in_dygraph_mode(): - _C_ops.rmsprop_( + _legacy_C_ops.decayed_adagrad( param_and_grad[0], - mean_square_acc, param_and_grad[1], - momentum_acc, + moment_acc, self._create_param_lr(param_and_grad), - mean_grad_acc, - master_weight, + param_and_grad[0], + moment_acc, + "epsilon", self._epsilon, - self._rho, - self._momentum, - self._centered, - find_master, + "decay", + self._decay, ) - return None else: - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": momentum_acc, - "MeanSquare": mean_square_acc, - "MeanGrad": mean_grad_acc, - "LearningRate": self._create_param_lr(param_and_grad), - } - - outputs = { - "ParamOut": param_and_grad[0], - "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc, - "MeanGradOut": mean_grad_acc, - } - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - rmsprop_op = block.append_op( + # Create the decayed adagrad optimizer op + decayed_adagrad_op = block.append_op( type=self.type, - inputs=inputs, - outputs=outputs, - attrs={ - "epsilon": self._epsilon, - "decay": self._rho, - "momentum": self._momentum, - "centered": self._centered, - "multi_precision": find_master, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": moment_acc, }, + attrs={"epsilon": self._epsilon, "decay": self._decay}, stop_gradient=True, ) - return rmsprop_op + return decayed_adagrad_op class FtrlOptimizer(Optimizer): @@ -3689,202 +2840,6 @@ class FtrlOptimizer(Optimizer): return ftrl_op -class LambOptimizer(AdamOptimizer): - r""" - LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. - - LAMB Optimizer is designed to scale up the batch size of training without losing - accuracy, which supports adaptive element-wise updating and accurate layer-wise - correction. For more information, please refer to `Large Batch Optimization for - Deep Learning: Training BERT in 76 minutes `_ . - - The updating of parameters follows: - - .. 
math:: - - m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t - - v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2 - - m_t &= \\frac{m_t}{\\beta_1^t} - - v_t &= \\frac{v_t}{\\beta_2^t} - - r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon} - - w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1}) - - - where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the - learning rate, :math:`\\lambda` the LAMB weight decay rate. - - Args: - learning_rate (float|Variable, optional): the learning rate used to update parameters. \ - Can be a float value or a Variable with data type float32. Default 0.001. - lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. - beta1 (float, optional): The exponential decay rate for the 1st moment estimates. - Default 0.9. - beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. - Default 0.999. - epsilon (float, optional): A small float value for numerical stability. Default 1e-6. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` , - :ref:`api_paddle_fluid_clip_ClipGradByValue` ). If you want better convergence, it is recommended - to use :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping. - exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight - decay when **exclude_from_weight_decay_fn(parameter)** returns true. - Default None. - name(str|None): For detailed information, please refer to - :ref:`api_guide_Name` . Usually name is no need to set and None by default. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - data = paddle.static.data(name='x', shape=[-1, 5], dtype='float32') - hidden = paddle.static.nn.fc(x=data, size=10) - cost = paddle.mean(hidden) - - def exclude_fn(param): - return param.name.endswith('.b_0') - - optimizer = fluid.optimizer.Lamb(learning_rate=0.002, - exclude_from_weight_decay_fn=exclude_fn) - optimizer.minimize(cost) - """ - _moment1_acc_str = "moment1" - _moment2_acc_str = "moment2" - _beta1_pow_acc_str = "beta1_pow_acc" - _beta2_pow_acc_str = "beta2_pow_acc" - - def __init__( - self, - learning_rate=0.001, - lamb_weight_decay=0.01, - beta1=0.9, - beta2=0.999, - epsilon=1e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - exclude_from_weight_decay_fn=None, - name=None, - ): - assert learning_rate is not None - assert lamb_weight_decay is not None - assert beta1 is not None - assert beta2 is not None - assert epsilon is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - name=name, - ) - self.type = "lamb" - self._weight_decay = lamb_weight_decay - self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - block.program._use_lamb = True - - moment1 = self._get_accumulator( - self._moment1_acc_str, param_and_grad[0] - ) - moment2 = self._get_accumulator( - self._moment2_acc_str, param_and_grad[0] - ) - beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param_and_grad[0] - ) - beta2_pow_acc = self._get_accumulator( - self._beta2_pow_acc_str, param_and_grad[0] - ) - - if ( - self._exclude_from_weight_decay_fn is not None - and self._exclude_from_weight_decay_fn(param_and_grad[0]) - ): - weight_decay = 0.0 - else: - weight_decay = self._weight_decay - lr = self._create_param_lr(param_and_grad) - master_weight = None - if in_dygraph_mode(): - _legacy_C_ops.lamb( - param_and_grad[0], - param_and_grad[1], - lr, - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - param_and_grad[0], - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - 'beta1', - self._beta1, - 'beta2', - self._beta2, - 'epsilon', - self._epsilon, - 'weight_decay', - weight_decay, - ) - return None - - # create the lamb optimize op - lamb_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr, - "Moment1": moment1, - "Moment2": moment2, - "Beta1Pow": beta1_pow_acc, - "Beta2Pow": beta2_pow_acc, - }, - outputs={ - "ParamOut": param_and_grad[0], - "Moment1Out": moment1, - "Moment2Out": moment2, - "Beta1PowOut": beta1_pow_acc, - "Beta2PowOut": beta2_pow_acc, - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - "weight_decay": weight_decay, - }, - stop_gradient=True, - ) - - return lamb_op - - # We short the class name, since users will use the optimizer with the package # name. 
The sample code: # @@ -3895,16 +2850,11 @@ class LambOptimizer(AdamOptimizer): # It is no need to add an `Optimizer` as the class suffix SGD = SGDOptimizer Momentum = MomentumOptimizer -Adagrad = AdagradOptimizer Adam = AdamOptimizer -Adamax = AdamaxOptimizer Dpsgd = DpsgdOptimizer DecayedAdagrad = DecayedAdagradOptimizer -Adadelta = AdadeltaOptimizer -RMSProp = RMSPropOptimizer Ftrl = FtrlOptimizer LarsMomentum = LarsMomentumOptimizer -Lamb = LambOptimizer class ModelAverage(Optimizer): diff --git a/python/setup.py.in b/python/setup.py.in index 60d2a6a1d6f..084b3f3af7d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -426,7 +426,6 @@ packages=['paddle', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.contrib', 'paddle.fluid.incubate', 'paddle.incubate.distributed.fleet', 'paddle.fluid.incubate.checkpoint', diff --git a/setup.py b/setup.py index d5de08adb24..42deaed19b9 100644 --- a/setup.py +++ b/setup.py @@ -1430,7 +1430,6 @@ def get_setup_parameters(): 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.contrib', 'paddle.fluid.incubate', 'paddle.incubate.distributed.fleet', 'paddle.fluid.incubate.checkpoint', diff --git a/test/amp/test_amp_api.py b/test/amp/test_amp_api.py index acb47d3302e..7c2a1f87069 100644 --- a/test/amp/test_amp_api.py +++ b/test/amp/test_amp_api.py @@ -70,7 +70,7 @@ class TestStaticDecorate(AmpTestBase): ) out = model(x) loss = paddle.mean(out) - optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001) + optimizer = paddle.optimizer.Adadelta(learning_rate=0.001) optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=128.0, diff --git a/test/book/notest_understand_sentiment.py b/test/book/notest_understand_sentiment.py index 6d43dfb3d8a..eb4e01ae294 100644 --- a/test/book/notest_understand_sentiment.py +++ b/test/book/notest_understand_sentiment.py @@ -84,7 +84,7 @@ def train( else: raise NotImplementedError() - adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) + adagrad = paddle.optimizer.Adagrad(learning_rate=0.002) adagrad.minimize(cost) train_data = paddle.batch( diff --git a/test/contrib/test_image_classification_fp16.py b/test/contrib/test_image_classification_fp16.py index 7a13621e956..9d192a1c76d 100644 --- a/test/contrib/test_image_classification_fp16.py +++ b/test/contrib/test_image_classification_fp16.py @@ -139,7 +139,7 @@ def train(net_type, use_cuda, save_dirname, is_local): # Test program test_program = train_program.clone(for_test=True) - optimizer = fluid.optimizer.Lamb(learning_rate=0.001) + optimizer = paddle.optimizer.Lamb(learning_rate=0.001) amp_lists = paddle.static.amp.AutoMixedPrecisionLists( custom_black_varnames={"loss", "conv2d_0.w_0"} @@ -513,7 +513,7 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase): ) avg_cost = paddle.mean(cost) - optimizer = fluid.optimizer.Lamb(learning_rate=0.001) + optimizer = paddle.optimizer.Lamb(learning_rate=0.001) amp_lists = paddle.static.amp.AutoMixedPrecisionLists( custom_black_varnames={"loss", "conv2d_0.w_0"} ) diff --git a/test/dygraph_to_static/test_reinforcement_learning.py b/test/dygraph_to_static/test_reinforcement_learning.py index 9a50e5e7f44..e8aa1dbf9bf 100644 --- a/test/dygraph_to_static/test_reinforcement_learning.py +++ b/test/dygraph_to_static/test_reinforcement_learning.py @@ -73,8 +73,8 @@ def train(args, place, to_static): policy = Policy() eps = np.finfo(np.float32).eps.item() - optimizer = fluid.optimizer.AdamaxOptimizer( - learning_rate=1e-2, 
parameter_list=policy.parameters() + optimizer = paddle.optimizer.Adamax( + learning_rate=1e-2, parameters=policy.parameters() ) def get_mean_and_std(values=[]): diff --git a/test/dygraph_to_static/test_sentiment.py b/test/dygraph_to_static/test_sentiment.py index 3ad8ce63334..3da23c08492 100644 --- a/test/dygraph_to_static/test_sentiment.py +++ b/test/dygraph_to_static/test_sentiment.py @@ -328,8 +328,8 @@ def train(args, to_static): model = GRU(args.vocab_size, args.batch_size, args.padding_size) elif args.model_type == 'bigru_net': model = BiGRU(args.vocab_size, args.batch_size, args.padding_size) - sgd_optimizer = fluid.optimizer.Adagrad( - learning_rate=args.lr, parameter_list=model.parameters() + sgd_optimizer = paddle.optimizer.Adagrad( + learning_rate=args.lr, parameters=model.parameters() ) loss_data = [] diff --git a/test/legacy_test/fleet_ps_training.py b/test/legacy_test/fleet_ps_training.py index 50d43b6c00a..773448b417c 100644 --- a/test/legacy_test/fleet_ps_training.py +++ b/test/legacy_test/fleet_ps_training.py @@ -30,7 +30,7 @@ with fluid.device_guard("gpu"): input_y = paddle.cast(input_y, dtype="int64") cost = mlp(input_x, input_y) -optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) +optimizer = paddle.optimizer.Adagrad(learning_rate=0.01) role = role_maker.PaddleCloudRoleMaker() fleet.init(role) diff --git a/test/legacy_test/test_adadelta_op.py b/test/legacy_test/test_adadelta_op.py index 14e791ce18f..42080a4280b 100644 --- a/test/legacy_test/test_adadelta_op.py +++ b/test/legacy_test/test_adadelta_op.py @@ -403,118 +403,5 @@ class TestAdadeltaMultiPrecision2_0(unittest.TestCase): ) -class TestAdadeltaMultiPrecision1_0(unittest.TestCase): - def dygraph_adadelta_mp(self, use_amp, mp): - paddle.disable_static() - paddle.seed(10) - paddle.set_device('gpu') - input = paddle.randn((2, 2)) - model = paddle.nn.Linear(2, 2) - optimizer = paddle.fluid.optimizer.Adadelta( - learning_rate=0.001, - parameter_list=model.parameters(), - ) - optimizer._multi_precision = mp - if use_amp: - model = paddle.amp.decorate(models=model, level='O2') - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - - for idx in range(5): - if use_amp: - with paddle.amp.auto_cast(level='O2'): - output = model(input) - loss = paddle.mean(output) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) - optimizer.clear_gradients() - else: - output = model(input) - loss = paddle.mean(output) - optimizer.minimize(loss) - optimizer.clear_gradients() - - return output, model.parameters() - - def static_adadelta_mp(self, use_amp, mp): - paddle.enable_static() - paddle.seed(100) - np.random.seed(100) - exe = paddle.static.Executor('gpu') - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001) - optimizer._multi_precision = mp - - if use_amp: - optimizer = paddle.static.amp.decorate( - optimizer, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True, - use_fp16_guard=False, - ) - with paddle.static.program_guard(train_program, startup_program): - if use_amp: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float16' - ) - else: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float32' - ) - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer.minimize(loss) - exe.run(startup_program) - - if use_amp: - optimizer.amp_init( - place=paddle.CUDAPlace(0), scope=paddle.static.global_scope() - ) - x = 
np.random.random(size=(2, 2)).astype('float16') - else: - x = np.random.random(size=(2, 2)).astype('float32') - out = [] - for idx in range(5): - (loss_data,) = exe.run( - train_program, feed={"X": x}, fetch_list=[loss.name] - ) - out.append(loss_data) - return out - - def test_main(self): - if not paddle.is_compiled_with_cuda(): - return - "Test dygraph mode" - output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True) - output2_dy, params2_dy = self.dygraph_adadelta_mp( - use_amp=False, mp=False - ) - np.testing.assert_allclose( - output1_dy.astype('float32').numpy(), - output2_dy.astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - for idx in range(len(params1_dy)): - np.testing.assert_allclose( - params1_dy[idx].astype('float32').numpy(), - params2_dy[idx].astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - "Test static mode" - output1_st = self.static_adadelta_mp(use_amp=True, mp=True) - output2_st = self.static_adadelta_mp(use_amp=False, mp=False) - for idx in range(len(output1_st)): - np.testing.assert_allclose( - output1_st[idx].astype('float32'), - output2_st[idx].astype('float32'), - rtol=1e-05, - atol=0.1, - ) - - if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_adagrad_op.py b/test/legacy_test/test_adagrad_op.py index d89e9233b33..66f91de028d 100644 --- a/test/legacy_test/test_adagrad_op.py +++ b/test/legacy_test/test_adagrad_op.py @@ -369,117 +369,6 @@ class TestAdagradMultiPrecision2_0(unittest.TestCase): ) -class TestAdagradMultiPrecision1_0(unittest.TestCase): - def dygraph_adagrad_mp(self, use_amp, mp): - paddle.disable_static() - paddle.seed(10) - paddle.set_device('gpu') - input = paddle.randn((2, 2)) - model = paddle.nn.Linear(2, 2) - optimizer = paddle.fluid.optimizer.Adagrad( - learning_rate=0.001, parameter_list=model.parameters() - ) - optimizer._multi_precision = mp - if use_amp: - model = paddle.amp.decorate(models=model, level='O2') - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - - for idx in range(5): - if use_amp: - with paddle.amp.auto_cast(level='O2'): - output = model(input) - loss = paddle.mean(output) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) - optimizer.clear_gradients() - else: - output = model(input) - loss = paddle.mean(output) - optimizer.minimize(loss) - optimizer.clear_gradients() - - return output, model.parameters() - - def static_adagrad_mp(self, use_amp, mp): - paddle.enable_static() - paddle.seed(100) - np.random.seed(100) - exe = paddle.static.Executor('gpu') - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - optimizer = paddle.fluid.optimizer.Adagrad(learning_rate=0.001) - optimizer._multi_precision = mp - if use_amp: - optimizer = paddle.static.amp.decorate( - optimizer, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True, - use_fp16_guard=False, - ) - with paddle.static.program_guard(train_program, startup_program): - if use_amp: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float16' - ) - else: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float32' - ) - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer.minimize(loss) - exe.run(startup_program) - - if use_amp: - optimizer.amp_init( - place=paddle.CUDAPlace(0), scope=paddle.static.global_scope() - ) - x = np.random.random(size=(2, 2)).astype('float16') - else: - x = np.random.random(size=(2, 2)).astype('float32') - out = [] - for idx in range(5): - 
(loss_data,) = exe.run( - train_program, feed={"X": x}, fetch_list=[loss.name] - ) - out.append(loss_data) - return out - - def test_main(self): - if not paddle.is_compiled_with_cuda(): - return - "Test dygraph mode" - output1_dy, params1_dy = self.dygraph_adagrad_mp(use_amp=True, mp=True) - output2_dy, params2_dy = self.dygraph_adagrad_mp( - use_amp=False, mp=False - ) - np.testing.assert_allclose( - output1_dy.astype('float32').numpy(), - output2_dy.astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - for idx in range(len(params1_dy)): - np.testing.assert_allclose( - params1_dy[idx].astype('float32').numpy(), - params2_dy[idx].astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - "Test static mode" - output1_st = self.static_adagrad_mp(use_amp=True, mp=True) - output2_st = self.static_adagrad_mp(use_amp=False, mp=False) - for idx in range(len(output1_st)): - np.testing.assert_allclose( - output1_st[idx].astype('float32'), - output2_st[idx].astype('float32'), - rtol=1e-05, - atol=0.1, - ) - - if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/test/legacy_test/test_adamax_op.py b/test/legacy_test/test_adamax_op.py index a473b4fece8..c12bcfd994f 100644 --- a/test/legacy_test/test_adamax_op.py +++ b/test/legacy_test/test_adamax_op.py @@ -397,114 +397,5 @@ class TestAdamaxMultiPrecision2_0(unittest.TestCase): ) -class TestAdamaxMultiPrecision1_0(unittest.TestCase): - def dygraph_adamax_mp(self, use_amp, mp): - paddle.disable_static() - paddle.seed(10) - paddle.set_device('gpu') - input = paddle.randn((2, 2)) - model = paddle.nn.Linear(2, 2) - optimizer = paddle.fluid.optimizer.Adamax( - learning_rate=0.001, parameter_list=model.parameters() - ) - optimizer._multi_precision = mp - if use_amp: - model = paddle.amp.decorate(models=model, level='O2') - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - - for idx in range(5): - if use_amp: - with paddle.amp.auto_cast(level='O2'): - output = model(input) - loss = paddle.mean(output) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) - optimizer.clear_gradients() - else: - output = model(input) - loss = paddle.mean(output) - optimizer.minimize(loss) - optimizer.clear_gradients() - - return output, model.parameters() - - def static_adamax_mp(self, use_amp, mp): - paddle.enable_static() - paddle.seed(100) - np.random.seed(100) - exe = paddle.static.Executor('gpu') - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - optimizer = paddle.fluid.optimizer.Adamax(learning_rate=0.001) - optimizer._multi_precision = mp - if use_amp: - optimizer = paddle.static.amp.decorate( - optimizer, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True, - use_fp16_guard=False, - ) - with paddle.static.program_guard(train_program, startup_program): - if use_amp: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float16' - ) - else: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float32' - ) - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer.minimize(loss) - exe.run(startup_program) - - if use_amp: - optimizer.amp_init( - place=paddle.CUDAPlace(0), scope=paddle.static.global_scope() - ) - x = np.random.random(size=(2, 2)).astype('float16') - else: - x = np.random.random(size=(2, 2)).astype('float32') - out = [] - for idx in range(5): - (loss_data,) = exe.run( - train_program, feed={"X": x}, fetch_list=[loss.name] - ) - out.append(loss_data) - return out - - def 
test_main(self): - if not paddle.is_compiled_with_cuda(): - return - "Test dygraph mode" - output1_dy, params1_dy = self.dygraph_adamax_mp(use_amp=True, mp=True) - output2_dy, params2_dy = self.dygraph_adamax_mp(use_amp=False, mp=False) - np.testing.assert_allclose( - output1_dy.astype('float32').numpy(), - output2_dy.astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - for idx in range(len(params1_dy)): - np.testing.assert_allclose( - params1_dy[idx].astype('float32').numpy(), - params2_dy[idx].astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - "Test static mode" - output1_st = self.static_adamax_mp(use_amp=True, mp=True) - output2_st = self.static_adamax_mp(use_amp=False, mp=False) - for idx in range(len(output1_st)): - np.testing.assert_allclose( - output1_st[idx].astype('float32'), - output2_st[idx].astype('float32'), - rtol=1e-05, - atol=0.1, - ) - - if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_case.py b/test/legacy_test/test_case.py index 4dadbb72df9..e6323f68312 100644 --- a/test/legacy_test/test_case.py +++ b/test/legacy_test/test_case.py @@ -614,7 +614,7 @@ class TestMutiTask(unittest.TestCase): one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1) adam = optimizer.Adam(learning_rate=0.001) - adagrad = optimizer.Adagrad(learning_rate=0.001) + adagrad = paddle.optimizer.Adagrad(learning_rate=0.001) def fn_1(): sum = paddle.multiply(x, y) diff --git a/test/legacy_test/test_eager_deletion_dynamic_rnn_base.py b/test/legacy_test/test_eager_deletion_dynamic_rnn_base.py index 134bf926608..5716eaa8b03 100644 --- a/test/legacy_test/test_eager_deletion_dynamic_rnn_base.py +++ b/test/legacy_test/test_eager_deletion_dynamic_rnn_base.py @@ -42,7 +42,7 @@ def train(network, use_cuda, batch_size=32, pass_num=2): cost = network(data, label, word_dict_size) cost.persistable = True - optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer = paddle.optimizer.Adagrad(learning_rate=0.2) optimizer.minimize(cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() diff --git a/test/legacy_test/test_imperative_optimizer.py b/test/legacy_test/test_imperative_optimizer.py index 2bc9107bc2a..ad21bb1b145 100644 --- a/test/legacy_test/test_imperative_optimizer.py +++ b/test/legacy_test/test_imperative_optimizer.py @@ -23,22 +23,17 @@ from paddle import fluid from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer from paddle.fluid import core from paddle.fluid.optimizer import ( - AdadeltaOptimizer, - AdagradOptimizer, Adam, - AdamaxOptimizer, DecayedAdagradOptimizer, DpsgdOptimizer, ExponentialMovingAverage, FtrlOptimizer, - LambOptimizer, LarsMomentumOptimizer, LookaheadOptimizer, ModelAverage, MomentumOptimizer, PipelineOptimizer, RecomputeOptimizer, - RMSPropOptimizer, SGDOptimizer, ) @@ -593,13 +588,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = AdagradOptimizer( - learning_rate=0.2, parameter_list=parameter_list + optimizer = paddle.optimizer.Adagrad( + learning_rate=0.2, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = AdagradOptimizer(learning_rate=0.2) + optimizer = paddle.optimizer.Adagrad(learning_rate=0.2) return optimizer def test_adagrad(self): @@ -608,13 +603,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): def 
get_optimizer_dygraph(self, parameter_list): - optimizer = AdamaxOptimizer( - learning_rate=0.2, parameter_list=parameter_list + optimizer = paddle.optimizer.Adamax( + learning_rate=0.2, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = AdamaxOptimizer(learning_rate=0.2) + optimizer = paddle.optimizer.Adamax(learning_rate=0.2) return optimizer def test_adamax(self): @@ -661,16 +656,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = AdadeltaOptimizer( + optimizer = paddle.optimizer.Adadelta( learning_rate=0.0003, epsilon=1.0e-6, rho=0.95, - parameter_list=parameter_list, + parameters=parameter_list, ) return optimizer def get_optimizer(self): - optimizer = AdadeltaOptimizer( + optimizer = paddle.optimizer.Adadelta( learning_rate=0.0003, epsilon=1.0e-6, rho=0.95 ) return optimizer @@ -681,13 +676,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = RMSPropOptimizer( - learning_rate=0.1, parameter_list=parameter_list + optimizer = paddle.optimizer.RMSProp( + learning_rate=0.1, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = RMSPropOptimizer(learning_rate=0.1) + optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) return optimizer def test_rmsprop(self): @@ -715,15 +710,15 @@ def exclude_fn(param): class TestImperativeLambOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = LambOptimizer( + optimizer = paddle.optimizer.Lamb( learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn, - parameter_list=parameter_list, + parameters=parameter_list, ) return optimizer def get_optimizer(self): - optimizer = LambOptimizer( + optimizer = paddle.optimizer.Lamb( learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn ) return optimizer diff --git a/test/legacy_test/test_imperative_optimizer_v2.py b/test/legacy_test/test_imperative_optimizer_v2.py index 83b3b56d7e3..b9cfe1e0132 100644 --- a/test/legacy_test/test_imperative_optimizer_v2.py +++ b/test/legacy_test/test_imperative_optimizer_v2.py @@ -23,9 +23,6 @@ from paddle import fluid from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer from paddle.fluid import core from paddle.fluid.optimizer import ( - AdadeltaOptimizer, - AdagradOptimizer, - AdamaxOptimizer, DecayedAdagradOptimizer, DpsgdOptimizer, ExponentialMovingAverage, @@ -36,7 +33,6 @@ from paddle.fluid.optimizer import ( MomentumOptimizer, PipelineOptimizer, RecomputeOptimizer, - RMSPropOptimizer, ) # Note(wangzhongpu) @@ -721,13 +717,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = AdagradOptimizer( - learning_rate=0.2, parameter_list=parameter_list + optimizer = paddle.optimizer.Adagrad( + learning_rate=0.2, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = AdagradOptimizer(learning_rate=0.2) + optimizer = paddle.optimizer.Adagrad(learning_rate=0.2) return optimizer def test_adagrad(self): @@ -736,13 +732,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): def 
get_optimizer_dygraph(self, parameter_list): - optimizer = AdamaxOptimizer( - learning_rate=0.2, parameter_list=parameter_list + optimizer = paddle.optimizer.Adamax( + learning_rate=0.2, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = AdamaxOptimizer(learning_rate=0.2) + optimizer = paddle.optimizer.Adamax(learning_rate=0.2) return optimizer def test_adamax(self): @@ -789,16 +785,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = AdadeltaOptimizer( + optimizer = paddle.optimizer.Adadelta( learning_rate=0.0003, epsilon=1.0e-6, rho=0.95, - parameter_list=parameter_list, + parameters=parameter_list, ) return optimizer def get_optimizer(self): - optimizer = AdadeltaOptimizer( + optimizer = paddle.optimizer.Adadelta( learning_rate=0.0003, epsilon=1.0e-6, rho=0.95 ) return optimizer @@ -809,13 +805,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = RMSPropOptimizer( - learning_rate=0.1, parameter_list=parameter_list + optimizer = paddle.optimizer.RMSProp( + learning_rate=0.1, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = RMSPropOptimizer(learning_rate=0.1) + optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) return optimizer def test_rmsprop(self): diff --git a/test/legacy_test/test_momentum_op.py b/test/legacy_test/test_momentum_op.py index acb2e76af4d..d59cfab7d17 100644 --- a/test/legacy_test/test_momentum_op.py +++ b/test/legacy_test/test_momentum_op.py @@ -677,11 +677,11 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): out = linear(inp) loss = paddle.mean(out) # This can be any optimizer supported by dygraph. 
- momentum = paddle.fluid.contrib.optimizer.Momentum( + momentum = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9, - parameter_list=linear.parameters(), - regularization=regularization, + parameters=linear.parameters(), + weight_decay=regularization, ) momentum.minimize(loss) @@ -703,7 +703,7 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): ) avg_cost = paddle.mean(cost) - momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( + momentum_optimizer = paddle.optimizer.Momentum( learning_rate=0.1, momentum=0.9 ) momentum_optimizer.minimize(avg_cost) @@ -833,11 +833,11 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): weight_attr=paddle.nn.initializer.Constant(value=2.0), bias_attr=paddle.nn.initializer.Constant(value=2.0), ) - momentum_new = paddle.fluid.contrib.optimizer.Momentum( + momentum_new = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9, - parameter_list=linear_new.parameters(), - regularization=paddle.regularizer.L2Decay(coeff=0.1), + parameters=linear_new.parameters(), + weight_decay=paddle.regularizer.L2Decay(coeff=0.1), ) self.__update_params(momentum=momentum_new, linear=linear_new) diff --git a/test/legacy_test/test_optimizer.py b/test/legacy_test/test_optimizer.py index b92a1972982..00babb8ad9d 100644 --- a/test/legacy_test/test_optimizer.py +++ b/test/legacy_test/test_optimizer.py @@ -248,72 +248,6 @@ class TestMomentumOptimizer(unittest.TestCase): self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) -class TestAdagradOptimizer(unittest.TestCase): - class MockAdagrad(optimizer.AdagradOptimizer): - def get_accumulators(self): - return self._accumulators - - def get_moment_str(self): - return self._moment_acc_str - - def test_adagrad_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - optimize_attr={'learning_rate': 1.1}, - ) - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y" - ) - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out" - ) - block.append_op( - type="mul", - inputs={"X": mul_x, "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}, - ) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out" - ) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} - ) - learning_rate = 0.01 - adagrad_optimizer = self.MockAdagrad( - learning_rate=learning_rate, epsilon=1.0e-6 - ) - params_grads = append_backward(mean_out) - self.assertEqual(len(params_grads), 1) - self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) - with framework.program_guard(program, init_program): - opts = adagrad_optimizer.apply_gradients(params_grads) - self.assertEqual(len(opts), 2) - self.assertEqual([op.type for op in opts], ["scale", "adagrad"]) - - # Check accumulators - accumulators = adagrad_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 1) - self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators) - moment_acc = accumulators[adagrad_optimizer.get_moment_str()] - self.assertEqual(len(moment_acc), 1) - self.assertTrue(mul_x.name in moment_acc) - - # Check init_program - init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 2) - self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) - 
self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) - - class TestAdamOptimizer(unittest.TestCase): class MockAdam(optimizer.AdamOptimizer): def get_accumulators(self): @@ -385,77 +319,6 @@ class TestAdamOptimizer(unittest.TestCase): self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) -class TestAdamaxOptimizer(unittest.TestCase): - class MockAdamax(optimizer.AdamaxOptimizer): - def get_accumulators(self): - return self._accumulators - - def get_moment_str(self): - return self._moment_acc_str - - def get_inf_norm_str(self): - return self._inf_norm_acc_str - - def test_adamax_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - optimize_attr={'learning_rate': 1.1}, - ) - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y" - ) - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out" - ) - block.append_op( - type="mul", - inputs={"X": mul_x, "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}, - ) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out" - ) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} - ) - learning_rate = 0.01 - adamax_optimizer = self.MockAdamax( - learning_rate=learning_rate, beta1=0.9, beta2=0.999 - ) - params_grads = append_backward(mean_out) - self.assertEqual(len(params_grads), 1) - self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) - with framework.program_guard(program, init_program): - opts = adamax_optimizer.apply_gradients(params_grads) - self.assertEqual(len(opts), 3) - self.assertEqual([op.type for op in opts], ["scale", "adamax", "scale"]) - - # Check accumulators - accumulators = adamax_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 3) - self.assertTrue(adamax_optimizer.get_moment_str() in accumulators) - self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators) - moment_acc = accumulators[adamax_optimizer.get_moment_str()] - inf_norm_acc = accumulators[adamax_optimizer.get_inf_norm_str()] - self.assertEqual(len(moment_acc), 1) - self.assertEqual(len(inf_norm_acc), 1) - self.assertTrue(mul_x.name in moment_acc) - self.assertTrue(mul_x.name in inf_norm_acc) - - # Check init_program - init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 4) - self.assertEqual(init_ops[-1].type, "fill_constant") - self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) - - class TestDpsgdOptimizer(unittest.TestCase): def test_dpsgd_optimizer(self): def check_dpsgd_optimizer(optimizer_attr): diff --git a/test/legacy_test/test_regularizer.py b/test/legacy_test/test_regularizer.py index 74826ab9d44..af7103d704f 100644 --- a/test/legacy_test/test_regularizer.py +++ b/test/legacy_test/test_regularizer.py @@ -203,9 +203,9 @@ class TestRegularizer(unittest.TestCase): avg_cost = model(data, label, self.word_len) - optimizer = fluid.optimizer.Adagrad( + optimizer = paddle.optimizer.Adagrad( learning_rate=0.1, - regularization=paddle.regularizer.L2Decay(1.0), + weight_decay=paddle.regularizer.L2Decay(1.0), ) optimizer.minimize(avg_cost) param_sum = self.run_program(place, [data, label]) @@ -236,7 +236,7 @@ class TestRegularizer(unittest.TestCase): para_sum.append(paddle.sum(para_mul)) avg_cost_l2 += paddle.add_n(para_sum) * 
0.5 - optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer = paddle.optimizer.Adagrad(learning_rate=0.1) optimizer.minimize(avg_cost_l2) param_sum = self.run_program(place, [data, label]) return param_sum diff --git a/test/legacy_test/test_regularizer_api.py b/test/legacy_test/test_regularizer_api.py index 27f9f6e6c40..415a5d963b1 100644 --- a/test/legacy_test/test_regularizer_api.py +++ b/test/legacy_test/test_regularizer_api.py @@ -116,9 +116,9 @@ class TestRegularizer(unittest.TestCase): avg_cost = model(data, label, self.word_len) - optimizer = fluid.optimizer.Adagrad( + optimizer = paddle.optimizer.Adagrad( learning_rate=0.1, - regularization=paddle.regularizer.L2Decay(1.0), + weight_decay=paddle.regularizer.L2Decay(1.0), ) optimizer.minimize(avg_cost) param_sum = self.run_program(place, [data, label]) @@ -149,7 +149,7 @@ class TestRegularizer(unittest.TestCase): para_sum.append(paddle.sum(para_mul)) avg_cost_l2 += paddle.add_n(para_sum) * 0.5 - optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer = paddle.optimizer.Adagrad(learning_rate=0.1) optimizer.minimize(avg_cost_l2) param_sum = self.run_program(place, [data, label]) return param_sum diff --git a/test/legacy_test/test_rmsprop_op.py b/test/legacy_test/test_rmsprop_op.py index 5f9579aaa07..8540e7cf3f2 100644 --- a/test/legacy_test/test_rmsprop_op.py +++ b/test/legacy_test/test_rmsprop_op.py @@ -521,119 +521,6 @@ class TestRMSPropMultiPrecision2_0(unittest.TestCase): ) -class TestRMSPropMultiPrecision1_0(unittest.TestCase): - def dygraph_rmsprop_mp(self, use_amp, mp): - paddle.disable_static() - paddle.seed(10) - paddle.set_device('gpu') - input = paddle.randn((2, 2)) - model = paddle.nn.Linear(2, 2) - optimizer = paddle.fluid.optimizer.RMSProp( - learning_rate=0.001, - parameter_list=model.parameters(), - ) - optimizer._multi_precision = mp - if use_amp: - model = paddle.amp.decorate(models=model, level='O2') - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - - for idx in range(5): - if use_amp: - with paddle.amp.auto_cast(level='O2'): - output = model(input) - loss = paddle.mean(output) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) - optimizer.clear_gradients() - else: - output = model(input) - loss = paddle.mean(output) - optimizer.minimize(loss) - optimizer.clear_gradients() - - return output, model.parameters() - - def static_rmsprop_mp(self, use_amp, mp): - paddle.enable_static() - paddle.seed(100) - np.random.seed(100) - exe = paddle.static.Executor('gpu') - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - optimizer = paddle.fluid.optimizer.RMSProp(learning_rate=0.001) - optimizer._multi_precision = mp - - if use_amp: - optimizer = paddle.static.amp.decorate( - optimizer, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True, - use_fp16_guard=False, - ) - with paddle.static.program_guard(train_program, startup_program): - if use_amp: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float16' - ) - else: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float32' - ) - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer.minimize(loss) - exe.run(startup_program) - - if use_amp: - optimizer.amp_init( - place=paddle.CUDAPlace(0), scope=paddle.static.global_scope() - ) - x = np.random.random(size=(2, 2)).astype('float16') - else: - x = np.random.random(size=(2, 2)).astype('float32') - out = [] - for idx in range(5): - (loss_data,) = 
exe.run( - train_program, feed={"X": x}, fetch_list=[loss.name] - ) - out.append(loss_data) - return out - - def test_main(self): - if not paddle.is_compiled_with_cuda(): - return - "Test dygraph mode" - output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True) - output2_dy, params2_dy = self.dygraph_rmsprop_mp( - use_amp=False, mp=False - ) - np.testing.assert_allclose( - output1_dy.astype('float32').numpy(), - output2_dy.astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - for idx in range(len(params1_dy)): - np.testing.assert_allclose( - params1_dy[idx].astype('float32').numpy(), - params2_dy[idx].astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - "Test static mode" - output1_st = self.static_rmsprop_mp(use_amp=True, mp=True) - output2_st = self.static_rmsprop_mp(use_amp=False, mp=False) - for idx in range(len(output1_st)): - np.testing.assert_allclose( - output1_st[idx].astype('float32'), - output2_st[idx].astype('float32'), - rtol=1e-05, - atol=0.1, - ) - - if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/test/legacy_test/test_trainable.py b/test/legacy_test/test_trainable.py index b56813a3a92..9e1818c0d2d 100644 --- a/test/legacy_test/test_trainable.py +++ b/test/legacy_test/test_trainable.py @@ -73,8 +73,8 @@ class TestTrainable(unittest.TestCase): self.check_trainable( test_trainable, feed_dict, - op_count={'adamax': 1, 'scale': 1, 'mul_grad': 0}, - optimizer=fluid.optimizer.Adamax(learning_rate=0.2), + op_count={'adamax': 1, 'scale': 1, 'mul_grad': 1}, + optimizer=paddle.optimizer.Adamax(learning_rate=0.2), ) diff --git a/test/legacy_test/test_weight_decay.py b/test/legacy_test/test_weight_decay.py index cda827cea8c..ae85324e9d6 100644 --- a/test/legacy_test/test_weight_decay.py +++ b/test/legacy_test/test_weight_decay.py @@ -157,7 +157,7 @@ class TestWeightDecay(unittest.TestCase): for var in main_prog.block(0).all_parameters() ] - optimizer = fluid.optimizer.Adagrad( + optimizer = paddle.optimizer.Adagrad( learning_rate=self.learning_rate ) optimizer.minimize(avg_cost) -- GitLab
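
For context, the snippet below is a minimal, illustrative sketch of the replacement pattern the patch applies in the tests above: each deprecated paddle.fluid optimizer is swapped for its paddle.optimizer counterpart, with parameter_list renamed to parameters and regularization renamed to weight_decay (the Lamb-specific exclude_from_weight_decay_fn hook keeps its name). The Linear layer, shapes, and hyperparameters here are placeholders chosen for illustration, not values taken from the patch.

import paddle

# A toy model; any paddle.nn.Layer works the same way.
model = paddle.nn.Linear(2, 2)

# Before (removed by this patch):
#   opt = paddle.fluid.optimizer.Adagrad(
#       learning_rate=0.1,
#       parameter_list=model.parameters(),
#       regularization=paddle.regularizer.L2Decay(1.0),
#   )
# After (the 2.x API used throughout the updated tests):
opt = paddle.optimizer.Adagrad(
    learning_rate=0.1,
    parameters=model.parameters(),
    weight_decay=paddle.regularizer.L2Decay(1.0),
)

# The same rename applies to Lamb; exclude_from_weight_decay_fn is unchanged,
# as in the fleet meta-optimizer above. Constructed here only to show the call.
lamb = paddle.optimizer.Lamb(
    learning_rate=0.002,
    parameters=model.parameters(),
    exclude_from_weight_decay_fn=lambda p: p.name.endswith('.b_0'),
)

# One dygraph training step with the new API.
loss = paddle.mean(model(paddle.randn((2, 2))))
loss.backward()
opt.step()
opt.clear_grad()

As in the updated tests, the new optimizers can also be driven through minimize() under static graph mode; in dygraph mode the backward()/step()/clear_grad() sequence shown above is the usual pattern.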