Unverified commit 94365855, authored by LoneRanger, committed by GitHub

Replace AdagradOptimizer, AdamaxOptimizer, AdadeltaOptimizer, RMSPropOptimizer, LambOptimizer and Momentum (#54152)

* replace the AdadeltaOptimizer with Adadelta

* replace the RMSPropOptimizer with RMSProp

* replace the LambOptimizer with Lamb

* replace the momentum in contrib/optimizer.py with Momentum in python/paddle/optimizer/momentum.py

* fix bug

* fix bug

* fix bug

* fix bug of Lamb

* fix bug of Lamb

* fix bug of import

* replace the AdamaxOptimizer with Adamax and change the optimizer base for AdagradOptimizer

* fix bug

* fix bug

* Update optimizer.py

* fix bug

* fix bug
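
For reference, the migration implied by this change looks roughly like the sketch below (a minimal, hedged example: the model and learning rates are placeholders, and the keyword names follow the paddle.optimizer API used elsewhere in this diff):

    import paddle

    linear = paddle.nn.Linear(10, 1)  # placeholder model

    # Before this PR (fluid API):
    #   opt = paddle.fluid.optimizer.AdagradOptimizer(
    #       learning_rate=0.2, parameter_list=linear.parameters())
    # After this PR (paddle.optimizer API):
    opt = paddle.optimizer.Adagrad(
        learning_rate=0.2, parameters=linear.parameters())

    # The same pattern applies to the other optimizers touched here:
    #   AdamaxOptimizer   -> paddle.optimizer.Adamax
    #   AdadeltaOptimizer -> paddle.optimizer.Adadelta
    #   RMSPropOptimizer  -> paddle.optimizer.RMSProp
    #   LambOptimizer     -> paddle.optimizer.Lamb
    # Note the argument rename: parameter_list -> parameters.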
Parent a1396a80
@@ -13,8 +13,8 @@
import logging
+import paddle
from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.optimizer import LambOptimizer as LAMB
from .meta_optimizer_base import MetaOptimizerBase
@@ -55,14 +55,13 @@ class LambOptimizer(MetaOptimizerBase):
_exclude_from_weight_decay_fn = exclude_fn
-self.lamb_opt = LAMB(
+self.lamb_opt = paddle.optimizer.Lamb(
learning_rate=opt._learning_rate,
lamb_weight_decay=configs['lamb_weight_decay'],
beta1=opt._beta1,
beta2=opt._beta2,
epsilon=opt._epsilon,
-parameter_list=opt._parameter_list,
+parameters=opt._parameter_list,
-regularization=opt.regularization,
grad_clip=opt._grad_clip,
exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn,
name=opt._name,
@@ -111,7 +110,7 @@ class LambOptimizer(MetaOptimizerBase):
return self.lamb_opt.apply_gradients(params_grads=params_grads)
def apply_optimize(self, loss, startup_program, params_grads):
-return self.lamb_opt.apply_optimize(
+return self.lamb_opt._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
......
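Since the fleet meta-optimizer above now builds paddle.optimizer.Lamb directly, a standalone construction looks roughly as follows (a hedged sketch; the exclude rule mirrors the Lamb docstring example further down in this diff, and the model is a placeholder):

    import paddle

    def exclude_fn(param):
        # skip weight decay for biases, as in the Lamb docstring example below
        return param.name.endswith('.b_0')

    linear = paddle.nn.Linear(5, 10)  # placeholder model
    opt = paddle.optimizer.Lamb(
        learning_rate=0.002,
        lamb_weight_decay=0.01,
        parameters=linear.parameters(),  # was parameter_list= in the fluid API
        exclude_from_weight_decay_fn=exclude_fn,
    )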
@@ -53,7 +53,6 @@ from . import initializer
from .initializer import set_global_initializer
from . import layers
from . import dygraph
-from . import contrib
from . import optimizer
from . import backward
from .backward import gradients
@@ -105,7 +104,6 @@ __all__ = (
'io',
'initializer',
'layers',
-'contrib',
'dygraph',
'enable_dygraph',
'disable_dygraph',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import optimizer
from .optimizer import *
__all__ = []
__all__ += optimizer.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Optimizer
from paddle.regularizer import L1Decay
from paddle.regularizer import L2Decay
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.framework import program_guard
from paddle.fluid import unique_name
from paddle.fluid import layers
from paddle.fluid.layer_helper import LayerHelper
import warnings
from paddle import _C_ops, _legacy_C_ops
__all__ = ['Momentum']
class Momentum(Optimizer):
r"""
Simple Momentum optimizer with velocity state
This optimizer has a flag for Nesterov Momentum.
The update equations are as follows:
.. math::
& velocity = mu * velocity + gradient
& if (use\_nesterov):
&\quad param = param - (gradient + mu * velocity) * learning\_rate
& else:
&\quad param = param - learning\_rate * velocity
Parameters:
learning_rate (float|Variable): The learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
momentum (float): Momentum factor
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
Often choose to be ``1.0/batch_size``.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
import numpy as np
paddle.enable_static()
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = paddle.static.data(name='x', shape=[1, 13], dtype='float32')
y = paddle.static.data(name='y', shape=[1], dtype='float32')
linear = paddle.nn.Linear(13, 1)
y_predict = linear(x)
cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
moment_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(paddle.static.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
"""
_velocity_acc_str = "velocity"
def __init__(
self,
learning_rate,
momentum,
parameter_list=None,
use_nesterov=False,
regularization=None,
grad_clip=None,
multi_precision=False,
rescale_grad=1.0,
name=None,
):
assert learning_rate is not None
assert momentum is not None
predicate = lambda regular: isinstance(regular, L2Decay)
py_regular = None if predicate(regularization) else regularization
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=py_regular,
grad_clip=grad_clip,
name=name,
)
self.type = "momentum"
self._momentum = momentum
self._use_nesterov = bool(use_nesterov)
self._regularization_method = ""
self._regularization_coeff = 0
if isinstance(regularization, L2Decay):
self._regularization_method = "l2_decay"
self._regularization_coeff = regularization._coeff
self._multi_precision = multi_precision
self._rescale_grad = rescale_grad
self._master_weights = {}
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer."
)
self._add_accumulator(self._velocity_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
velocity_acc = self._get_accumulator(
self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if framework.in_dygraph_mode():
_, _, _ = _legacy_C_ops.momentum(
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
param_and_grad[0],
velocity_acc,
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
self._regularization_method,
'regularization_coeff',
self._regularization_coeff,
'multi_precision',
find_master,
)
return None
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": self._regularization_method,
"regularization_coeff": self._regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc],
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return momentum_op
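The update rule quoted in the Momentum docstring above can be checked with a tiny hand-rolled sketch (pure NumPy, independent of Paddle; mu, lr and the single gradient step are illustrative values only):

    import numpy as np

    mu, lr = 0.9, 0.01
    param = np.array([1.0, 2.0])
    velocity = np.zeros_like(param)
    grad = np.array([0.5, -0.5])  # pretend gradient for one step

    velocity = mu * velocity + grad  # velocity = mu * velocity + gradient
    use_nesterov = True
    if use_nesterov:
        param = param - (grad + mu * velocity) * lr
    else:
        param = param - lr * velocity
    print(param)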
@@ -62,27 +62,19 @@ from ..fluid.framework import (
__all__ = [
'SGD',
'Momentum',
-'Adagrad',
'Adam',
-'Adamax',
'Dpsgd',
'DecayedAdagrad',
'Ftrl',
'SGDOptimizer',
'MomentumOptimizer',
-'AdagradOptimizer',
'AdamOptimizer',
-'AdamaxOptimizer',
'DpsgdOptimizer',
'DecayedAdagradOptimizer',
-'RMSPropOptimizer',
'FtrlOptimizer',
-'Adadelta',
-'AdadeltaOptimizer',
'ModelAverage',
'LarsMomentum',
'LarsMomentumOptimizer',
-'LambOptimizer',
'ExponentialMovingAverage',
'PipelineOptimizer',
'LookaheadOptimizer',
@@ -1983,181 +1975,6 @@ class LarsMomentumOptimizer(Optimizer):
return momentum_op
class AdagradOptimizer(Optimizer):
r"""
The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign
different learning rates to individual parameters.
The parameter ``param_out`` update rule with gradient ``grad``:
.. math::
moment\_out &= moment + grad * grad
param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
Related paper: `Adaptive Subgradient Methods for Online Learning and
Stochastic Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
The original paper does not have the ``epsilon`` attribute. It is added here
in our implementation as also proposed `Per-parameter adaptive learning rate
methods <http://cs231n.github.io/neural-networks-3/#ada>`_
for numerical stability to avoid the division by zero error.
Args:
learning_rate (float|Variable): The learning rate used to update ``Parameter``.
It can be a float value or a ``Variable`` with a float type.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-06.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
initial_accumulator_value (float, optional): Initial value for moment accumulator.
The default value is 0.0.
Examples:
.. code-block:: python
import paddle
import numpy as np
import paddle.fluid as fluid
paddle.enable_static()
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
inp = paddle.static.data(name="inp", shape=[2, 2], dtype="float32")
out = paddle.static.nn.fc(inp, size=3)
out = paddle.sum(out)
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.2)
optimizer.minimize(out)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(
feed={"inp": np_inp},
fetch_list=[out.name])
"""
_moment_acc_str = "moment"
def __init__(
self,
learning_rate,
epsilon=1.0e-6,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
initial_accumulator_value=0.0,
):
assert learning_rate is not None
assert epsilon is not None
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name,
)
self.type = "adagrad"
self._multi_precision = False
self._epsilon = epsilon
self.initial_accumulator_value = initial_accumulator_value
self._master_weights = {}
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(
self._moment_acc_str,
master_p,
fill_value=self.initial_accumulator_value,
)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(
self._moment_acc_str,
p,
fill_value=self.initial_accumulator_value,
)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
moment_acc = self._get_accumulator_master(
self._moment_acc_str, param_and_grad[0]
)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if in_dygraph_mode():
_C_ops.adagrad_(
param_and_grad[0],
param_and_grad[1],
moment_acc,
self._create_param_lr(param_and_grad),
master_weight,
self._epsilon,
find_master,
)
return None
else:
# Create the adagrad optimizer op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": moment_acc,
"LearningRate": self._create_param_lr(param_and_grad),
}
outputs = {
"ParamOut": param_and_grad[0],
"MomentOut": moment_acc,
}
attrs = {"epsilon": self._epsilon, "multi_precision": find_master}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adagrad_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return adagrad_op
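For orientation, the Adagrad rule quoted in the removed docstring above reduces to a few lines of NumPy (a hedged sketch with illustrative values, not Paddle code):

    import numpy as np

    lr, eps = 0.2, 1e-6
    param = np.array([1.0, -1.0])
    moment = np.zeros_like(param)
    grad = np.array([0.1, 0.3])  # pretend gradient for one step

    moment = moment + grad * grad                        # moment_out = moment + grad * grad
    param = param - lr * grad / (np.sqrt(moment) + eps)  # param_out update
    print(param)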
class AdamOptimizer(Optimizer):
r"""
The Adam optimizer uses an optimization described at the end
@@ -2586,59 +2403,10 @@ class AdamOptimizer(Optimizer):
)
class AdamaxOptimizer(Optimizer): class DpsgdOptimizer(Optimizer):
r""" r"""
The Adamax optimizer is implemented based on the Adamax Optimization We implement the Dpsgd optimizer according to CCS16 paper -
in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_. Deep Learning with Differential Privacy.
The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
which makes the learning rate update algorithm more stable and simple.
The parameter ``param_out`` update rule with gradient ``grad``:
.. math::
t & = t + 1
moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
The original paper does not have an ``epsilon`` attribute,
it is added here for numerical stability to prevent the division by 0 error.
Args:
learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
**Notes**:
**Currently, AdamaxOptimizer doesn't support sparse parameter optimization.**
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2655,11 +2423,11 @@ class AdamaxOptimizer(Optimizer): ...@@ -2655,11 +2423,11 @@ class AdamaxOptimizer(Optimizer):
train_program = fluid.Program() train_program = fluid.Program()
startup_program = fluid.Program() startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program): with fluid.program_guard(train_program, startup_program):
data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') data = paddle.static.data(name='X', shape=[-1,1], dtype='float32')
hidden = paddle.static.nn.fc(x=data, size=10) hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden) loss = paddle.mean(hidden)
adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2) optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
adam.minimize(loss) optimizer.minimize(loss)
# Run the startup program once and only once. # Run the startup program once and only once.
exe.run(startup_program) exe.run(startup_program)
...@@ -2668,240 +2436,46 @@ class AdamaxOptimizer(Optimizer): ...@@ -2668,240 +2436,46 @@ class AdamaxOptimizer(Optimizer):
outs = exe.run(program=train_program, outs = exe.run(program=train_program,
feed={'X': x}, feed={'X': x},
fetch_list=[loss.name]) fetch_list=[loss.name])
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
clip (float): clipping threshold
batch_size (float): batch size.
sigma (float): for gaussian noise.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
Notes:
Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
""" """
_moment_acc_str = "moment"
_inf_norm_acc_str = "inf_norm"
_beta1_pow_acc_str = "beta1_pow_acc"
def __init__( def __init__(
self, self,
learning_rate=0.001, learning_rate=0.001,
beta1=0.9, clip=0.9,
beta2=0.999, batch_size=0.999,
epsilon=1e-8, sigma=1e-8,
parameter_list=None, parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
): ):
assert learning_rate is not None assert learning_rate is not None
assert beta1 is not None assert clip is not None
assert beta2 is not None assert batch_size is not None
assert epsilon is not None assert sigma is not None
super().__init__( super().__init__(
learning_rate=learning_rate, learning_rate=learning_rate, parameter_list=parameter_list
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name,
)
self.type = "adamax"
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self._multi_precision = False
self._master_weights = {}
def _create_accumulators(self, block, parameters):
# Create accumulator tensors for first moment and infinity norm
for p in parameters:
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._moment_acc_str, master_p)
self._add_accumulator(self._inf_norm_acc_str, master_p)
self._add_accumulator(
name=self._beta1_pow_acc_str,
param=master_p,
fill_value=self._beta1,
shape=[1],
)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._moment_acc_str, p)
self._add_accumulator(self._inf_norm_acc_str, p)
self._add_accumulator(
name=self._beta1_pow_acc_str,
param=p,
fill_value=self._beta1,
shape=[1],
) )
self.type = "dpsgd"
def _append_optimize_op(self, block, param_and_grad): self._clip = clip
assert isinstance(block, framework.Block) self._batch_size = batch_size
self._sigma = sigma
moment = self._get_accumulator_master( '''
self._moment_acc_str, param_and_grad[0] Note(wangzhongpu):
) This property is only used for debugging, do not need to set it!
inf_norm = self._get_accumulator_master( Dpsgd operator use time(NULL) as random seed to generate random number.
self._inf_norm_acc_str, param_and_grad[0] However, during debugging, we need determinated result, so we will set self._seed to a fixed number.
) '''
beta1_pow_acc = self._get_accumulator_master( self._seed = None
self._beta1_pow_acc_str, param_and_grad[0]
)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if in_dygraph_mode():
_C_ops.adamax_(
param_and_grad[0],
param_and_grad[1],
self._create_param_lr(param_and_grad),
moment,
inf_norm,
beta1_pow_acc,
master_weight,
self._beta1,
self._beta2,
self._epsilon,
find_master,
)
else:
# create the adamax optimize op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": self._create_param_lr(param_and_grad),
"Moment": moment,
"InfNorm": inf_norm,
"Beta1Pow": beta1_pow_acc,
}
outputs = {
"ParamOut": param_and_grad[0],
"MomentOut": moment,
"InfNormOut": inf_norm,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
attrs = {
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon,
"multi_precision": find_master,
}
adamax_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return adamax_op
def _finish_update(self, block, parameters_and_grads):
"""Update Beta1 Power accumulator"""
assert isinstance(block, framework.Block)
for param, grad in parameters_and_grads:
if grad is None or param.trainable is False:
continue
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamx'):
beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param
)
if in_dygraph_mode():
tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, True)
beta1_pow_acc.copy_(tmp, False)
else:
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True,
)
class DpsgdOptimizer(Optimizer):
r"""
We implement the Dpsgd optimizer according to CCS16 paper -
Deep Learning with Differential Privacy.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy
import paddle
paddle.enable_static()
# First create the Executor.
place = fluid.CPUPlace() # fluid.CUDAPlace(0)
exe = fluid.Executor(place)
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
data = paddle.static.data(name='X', shape=[-1,1], dtype='float32')
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
optimizer.minimize(loss)
# Run the startup program once and only once.
exe.run(startup_program)
x = numpy.random.random(size=(10, 1)).astype('float32')
outs = exe.run(program=train_program,
feed={'X': x},
fetch_list=[loss.name])
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
clip (float): clipping threshold
batch_size (float): batch size.
sigma (float): for gaussian noise.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
Notes:
Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
"""
def __init__(
self,
learning_rate=0.001,
clip=0.9,
batch_size=0.999,
sigma=1e-8,
parameter_list=None,
):
assert learning_rate is not None
assert clip is not None
assert batch_size is not None
assert sigma is not None
super().__init__(
learning_rate=learning_rate, parameter_list=parameter_list
)
self.type = "dpsgd"
self._clip = clip
self._batch_size = batch_size
self._sigma = sigma
'''
Note(wangzhongpu):
This property is only used for debugging, do not need to set it!
Dpsgd operator use time(NULL) as random seed to generate random number.
However, during debugging, we need determinated result, so we will set self._seed to a fixed number.
'''
self._seed = None
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
@@ -2956,364 +2530,22 @@ class DecayedAdagradOptimizer(Optimizer):
.. math::
moment\_out & = decay * moment + (1 - decay) * grad * grad
param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic
Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
The original paper does not have an ``epsilon`` attribute. It is added here for numerical
stability to avoid the division by zero error.
Args:
learning_rate (float|Variable): The learning rate used to update ``Parameter``.
It can be a float value or a ``Variable`` with a float type.
decay (float, optional): The decay rate. The default value is 0.95.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-06.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
**Notes**:
**Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.**
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
paddle.enable_static()
x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
trans = paddle.static.nn.fc(x, 100)
cost = paddle.mean(trans)
optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2)
optimizer.minimize(cost)
"""
_moment_acc_str = "moment"
def __init__(
self,
learning_rate,
decay=0.95,
epsilon=1.0e-6,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
):
assert learning_rate is not None
assert decay is not None
assert epsilon is not None
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name,
)
self.type = "decayed_adagrad"
self._decay = decay
self._epsilon = epsilon
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
self._add_accumulator(self._moment_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
moment_acc = self._get_accumulator(
self._moment_acc_str, param_and_grad[0]
)
if in_dygraph_mode():
_legacy_C_ops.decayed_adagrad(
param_and_grad[0],
param_and_grad[1],
moment_acc,
self._create_param_lr(param_and_grad),
param_and_grad[0],
moment_acc,
"epsilon",
self._epsilon,
"decay",
self._decay,
)
else:
# Create the decayed adagrad optimizer op
decayed_adagrad_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": moment_acc,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={
"ParamOut": param_and_grad[0],
"MomentOut": moment_acc,
},
attrs={"epsilon": self._epsilon, "decay": self._decay},
stop_gradient=True,
)
return decayed_adagrad_op
class AdadeltaOptimizer(Optimizer):
r"""
**Notes: This API does not support sparse parameter optimization.**
Adadelta Optimizer. Please refer to this for details:
`ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
The update is done as follows:
.. math::
E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
Args:
learning_rate (float|Variable): global learning rate.
epsilon (float): a small float number for numeric stability. Default 1.0e-6.
rho (float): a floating point value indicating the decay rate. Default 0.95.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name` .
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
paddle.enable_static()
image = paddle.static.data(name='image', shape=[None, 28], dtype='float32')
fc = paddle.static.nn.fc(image, size=10)
cost = paddle.mean(fc)
optimizer = fluid.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
# optimizer_ops is a list of optimizer operators to update parameters
# params_grads is a list of (param, param_grad), where param is each
# parameter and param_grad is the gradient variable of param.
optimizer_ops, params_grads = optimizer.minimize(cost)
"""
_avg_squared_grad_acc_str = "_avg_squared_grad"
_avg_squared_update_acc_str = "_avg_squared_update"
def __init__(
self,
learning_rate,
epsilon=1.0e-6,
rho=0.95,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
):
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if epsilon is None:
raise ValueError("epsilon is not set.")
if rho is None:
raise ValueError("rho is not set.")
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
name=name,
)
self.type = "adadelta"
self._multi_precision = False
self._master_weights = {}
self._epsilon = epsilon
self._rho = rho
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
for p in parameters:
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
self._add_accumulator(
self._avg_squared_update_acc_str, master_p
)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._avg_squared_grad_acc_str, p)
self._add_accumulator(self._avg_squared_update_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
avg_squared_grad_acc = self._get_accumulator_master(
self._avg_squared_grad_acc_str, param_and_grad[0]
)
avg_squared_update_acc = self._get_accumulator_master(
self._avg_squared_update_acc_str, param_and_grad[0]
)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if in_dygraph_mode():
_C_ops.adadelta_(
param_and_grad[0],
param_and_grad[1],
avg_squared_grad_acc,
avg_squared_update_acc,
self._create_param_lr(param_and_grad),
master_weight,
self._rho,
self._epsilon,
find_master,
)
else:
# Create the adadelta optimizer op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"AvgSquaredGrad": avg_squared_grad_acc,
"AvgSquaredUpdate": avg_squared_update_acc,
"LearningRate": self._create_param_lr(param_and_grad),
}
outputs = {
"ParamOut": param_and_grad[0],
"AvgSquaredGradOut": avg_squared_grad_acc,
"AvgSquaredUpdateOut": avg_squared_update_acc,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adadelta_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs={
"epsilon": self._epsilon,
"rho": self._rho,
"multi_precision": find_master,
},
stop_gradient=True,
)
return adadelta_op
class RMSPropOptimizer(Optimizer):
r"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
rate method. The original slides proposed RMSProp: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
The original equation is as follows:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
The first equation calculates moving average of the squared gradient for
each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
In some cases, adding a momentum term :math: `\\beta` is beneficial.
In our implementation, Nesterov momentum is used:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
if centered is True:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t) param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic
and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
smoothing term to avoid division by zero, usually set somewhere in range
from 1e-4 to 1e-8.
The original paper does not have an ``epsilon`` attribute. It is added here for numerical
stability to avoid the division by zero error.
Parameters: Args:
learning_rate(float): Global learning rate. learning_rate (float|Variable): The learning rate used to update ``Parameter``.
rho(float): rho is :math: `\\rho` in equation, default is 0.95. It can be a float value or a ``Variable`` with a float type.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to decay (float, optional): The decay rate. The default value is 0.95.
avoid division by zero, default is 1e-6. epsilon (float, optional): A small float value for numerical stability.
momentum(float): :math:`\\beta` in equation is the momentum term, The default value is 1e-06.
default is 0.0.
centered(bool): If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \ This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated. The default value is None in static graph mode, at this time all parameters will be updated.
...@@ -3326,59 +2558,42 @@ class RMSPropOptimizer(Optimizer): ...@@ -3326,59 +2558,42 @@ class RMSPropOptimizer(Optimizer):
some derived class of ``GradientClipBase`` . There are three cliping strategies some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \ name (str, optional): Normally there is no need for user to set this property.
For details, please refer to :ref:`api_guide_Name`. Default is None. For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
Raises: **Notes**:
ValueError: If learning_rate, rho, epsilon, momentum are None. **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.**
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np
paddle.enable_static() paddle.enable_static()
place = fluid.CPUPlace() x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
main = fluid.Program() trans = paddle.static.nn.fc(x, 100)
with fluid.program_guard(main): cost = paddle.mean(trans)
x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2)
y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') optimizer.minimize(cost)
y_predict = paddle.static.nn.fc(x, size=1, activation=None)
cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
rms_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
""" """
_moment_acc_str = "moment"
_momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square"
_mean_grad_acc_str = "mean_grad"
def __init__( def __init__(
self, self,
learning_rate, learning_rate,
rho=0.95, decay=0.95,
epsilon=1.0e-6, epsilon=1.0e-6,
momentum=0.0,
centered=False,
parameter_list=None, parameter_list=None,
regularization=None, regularization=None,
grad_clip=None, grad_clip=None,
name=None, name=None,
): ):
assert learning_rate is not None
assert decay is not None
assert epsilon is not None
super().__init__( super().__init__(
learning_rate=learning_rate, learning_rate=learning_rate,
parameter_list=parameter_list, parameter_list=parameter_list,
...@@ -3386,119 +2601,55 @@ class RMSPropOptimizer(Optimizer): ...@@ -3386,119 +2601,55 @@ class RMSPropOptimizer(Optimizer):
grad_clip=grad_clip, grad_clip=grad_clip,
name=name, name=name,
) )
if learning_rate is None: self.type = "decayed_adagrad"
raise ValueError("learning_rate is not set.") self._decay = decay
if rho is None:
raise ValueError("rho is not set.")
if epsilon is None:
raise ValueError("epsilon is not set.")
if momentum is None:
raise ValueError("momentum is not set.")
self.type = "rmsprop"
self._rho = rho
self._epsilon = epsilon self._epsilon = epsilon
self._momentum = momentum
self._centered = centered
self._multi_precision = False
self._master_weights = {}
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): assert isinstance(block, framework.Block)
raise TypeError("block is not instance of framework.Block.")
for p in parameters: for p in parameters:
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): self._add_accumulator(self._moment_acc_str, p)
master_p = self._create_master_weight(p)
self._add_accumulator(self._momentum_acc_str, master_p)
self._add_accumulator(self._mean_square_acc_str, master_p)
self._add_accumulator(self._mean_grad_acc_str, master_p)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block): assert isinstance(block, framework.Block)
raise TypeError("block is not instance of framework.Block.")
momentum_acc = self._get_accumulator_master( moment_acc = self._get_accumulator(
self._momentum_acc_str, param_and_grad[0] self._moment_acc_str, param_and_grad[0]
)
mean_square_acc = self._get_accumulator_master(
self._mean_square_acc_str, param_and_grad[0]
)
mean_grad_acc = self._get_accumulator_master(
self._mean_grad_acc_str, param_and_grad[0]
)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
) )
if in_dygraph_mode(): if in_dygraph_mode():
_C_ops.rmsprop_( _legacy_C_ops.decayed_adagrad(
param_and_grad[0], param_and_grad[0],
mean_square_acc,
param_and_grad[1], param_and_grad[1],
momentum_acc, moment_acc,
self._create_param_lr(param_and_grad), self._create_param_lr(param_and_grad),
mean_grad_acc, param_and_grad[0],
master_weight, moment_acc,
"epsilon",
self._epsilon, self._epsilon,
self._rho, "decay",
self._momentum, self._decay,
self._centered,
find_master,
) )
return None
else: else:
inputs = { # Create the decayed adagrad optimizer op
decayed_adagrad_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": momentum_acc, "Moment": moment_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad), "LearningRate": self._create_param_lr(param_and_grad),
} },
outputs={
outputs = {
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
"MomentOut": momentum_acc, "MomentOut": moment_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
rmsprop_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs={
"epsilon": self._epsilon,
"decay": self._rho,
"momentum": self._momentum,
"centered": self._centered,
"multi_precision": find_master,
}, },
attrs={"epsilon": self._epsilon, "decay": self._decay},
stop_gradient=True, stop_gradient=True,
) )
return rmsprop_op return decayed_adagrad_op
class FtrlOptimizer(Optimizer): class FtrlOptimizer(Optimizer):
...@@ -3689,202 +2840,6 @@ class FtrlOptimizer(Optimizer): ...@@ -3689,202 +2840,6 @@ class FtrlOptimizer(Optimizer):
return ftrl_op return ftrl_op
class LambOptimizer(AdamOptimizer):
r"""
LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.
LAMB Optimizer is designed to scale up the batch size of training without losing
accuracy, which supports adaptive element-wise updating and accurate layer-wise
correction. For more information, please refer to `Large Batch Optimization for
Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .
The updating of parameters follows:
.. math::
m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t
v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2
m_t &= \\frac{m_t}{\\beta_1^t}
v_t &= \\frac{v_t}{\\beta_2^t}
r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}
w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the
learning rate, :math:`\\lambda` the LAMB weight decay rate.
Args:
learning_rate (float|Variable, optional): the learning rate used to update parameters. \
Can be a float value or a Variable with data type float32. Default 0.001.
lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
Default 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
Default 0.999.
epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` ,
:ref:`api_paddle_fluid_clip_ClipGradByValue` ). If you want better convergence, it is recommended
to use :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
decay when **exclude_from_weight_decay_fn(parameter)** returns true.
Default None.
name(str|None): For detailed information, please refer to
:ref:`api_guide_Name` . Usually name is no need to set and None by default.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
paddle.enable_static()
data = paddle.static.data(name='x', shape=[-1, 5], dtype='float32')
hidden = paddle.static.nn.fc(x=data, size=10)
cost = paddle.mean(hidden)
def exclude_fn(param):
return param.name.endswith('.b_0')
optimizer = fluid.optimizer.Lamb(learning_rate=0.002,
exclude_from_weight_decay_fn=exclude_fn)
optimizer.minimize(cost)
"""
_moment1_acc_str = "moment1"
_moment2_acc_str = "moment2"
_beta1_pow_acc_str = "beta1_pow_acc"
_beta2_pow_acc_str = "beta2_pow_acc"
def __init__(
self,
learning_rate=0.001,
lamb_weight_decay=0.01,
beta1=0.9,
beta2=0.999,
epsilon=1e-6,
parameter_list=None,
regularization=None,
grad_clip=None,
exclude_from_weight_decay_fn=None,
name=None,
):
assert learning_rate is not None
assert lamb_weight_decay is not None
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
name=name,
)
self.type = "lamb"
self._weight_decay = lamb_weight_decay
self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
block.program._use_lamb = True
moment1 = self._get_accumulator(
self._moment1_acc_str, param_and_grad[0]
)
moment2 = self._get_accumulator(
self._moment2_acc_str, param_and_grad[0]
)
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param_and_grad[0]
)
beta2_pow_acc = self._get_accumulator(
self._beta2_pow_acc_str, param_and_grad[0]
)
if (
self._exclude_from_weight_decay_fn is not None
and self._exclude_from_weight_decay_fn(param_and_grad[0])
):
weight_decay = 0.0
else:
weight_decay = self._weight_decay
lr = self._create_param_lr(param_and_grad)
master_weight = None
if in_dygraph_mode():
_legacy_C_ops.lamb(
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'beta1',
self._beta1,
'beta2',
self._beta2,
'epsilon',
self._epsilon,
'weight_decay',
weight_decay,
)
return None
# create the lamb optimize op
lamb_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr,
"Moment1": moment1,
"Moment2": moment2,
"Beta1Pow": beta1_pow_acc,
"Beta2Pow": beta2_pow_acc,
},
outputs={
"ParamOut": param_and_grad[0],
"Moment1Out": moment1,
"Moment2Out": moment2,
"Beta1PowOut": beta1_pow_acc,
"Beta2PowOut": beta2_pow_acc,
},
attrs={
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon,
"weight_decay": weight_decay,
},
stop_gradient=True,
)
return lamb_op
# We short the class name, since users will use the optimizer with the package
# name. The sample code:
#
@@ -3895,16 +2850,11 @@ class LambOptimizer(AdamOptimizer):
# It is no need to add an `Optimizer` as the class suffix
SGD = SGDOptimizer
Momentum = MomentumOptimizer
-Adagrad = AdagradOptimizer
Adam = AdamOptimizer
-Adamax = AdamaxOptimizer
Dpsgd = DpsgdOptimizer
DecayedAdagrad = DecayedAdagradOptimizer
-Adadelta = AdadeltaOptimizer
-RMSProp = RMSPropOptimizer
Ftrl = FtrlOptimizer
LarsMomentum = LarsMomentumOptimizer
-Lamb = LambOptimizer
class ModelAverage(Optimizer):
......
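With the `Lamb = LambOptimizer` alias removed from `paddle.fluid.optimizer`, callers construct the 2.x optimizer directly, as the updated AMP and imperative tests below do. The following is a minimal dygraph sketch of the replacement call, using the keyword names shown in this change (`parameters`, `lamb_weight_decay`, `exclude_from_weight_decay_fn`); the linear layer and the bias-exclusion rule are illustrative only, mirroring the removed docstring example.

import paddle

linear = paddle.nn.Linear(10, 10)  # hypothetical model, for illustration only

def exclude_fn(param):
    # mirror the removed docstring example: skip weight decay on bias parameters
    return param.name.endswith('.b_0')

opt = paddle.optimizer.Lamb(
    learning_rate=0.002,
    lamb_weight_decay=0.01,
    parameters=linear.parameters(),           # fluid took parameter_list
    exclude_from_weight_decay_fn=exclude_fn,
)

loss = paddle.mean(linear(paddle.randn([4, 10])))
loss.backward()
opt.step()
opt.clear_grad()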
...@@ -426,7 +426,6 @@ packages=['paddle', ...@@ -426,7 +426,6 @@ packages=['paddle',
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.layers', 'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.incubate', 'paddle.fluid.incubate',
'paddle.incubate.distributed.fleet', 'paddle.incubate.distributed.fleet',
'paddle.fluid.incubate.checkpoint', 'paddle.fluid.incubate.checkpoint',
......
...@@ -1430,7 +1430,6 @@ def get_setup_parameters(): ...@@ -1430,7 +1430,6 @@ def get_setup_parameters():
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.layers', 'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.incubate', 'paddle.fluid.incubate',
'paddle.incubate.distributed.fleet', 'paddle.incubate.distributed.fleet',
'paddle.fluid.incubate.checkpoint', 'paddle.fluid.incubate.checkpoint',
......
...@@ -70,7 +70,7 @@ class TestStaticDecorate(AmpTestBase): ...@@ -70,7 +70,7 @@ class TestStaticDecorate(AmpTestBase):
) )
out = model(x) out = model(x)
loss = paddle.mean(out) loss = paddle.mean(out)
optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001) optimizer = paddle.optimizer.Adadelta(learning_rate=0.001)
optimizer = paddle.static.amp.decorate( optimizer = paddle.static.amp.decorate(
optimizer, optimizer,
init_loss_scaling=128.0, init_loss_scaling=128.0,
......
...@@ -84,7 +84,7 @@ def train( ...@@ -84,7 +84,7 @@ def train(
else: else:
raise NotImplementedError() raise NotImplementedError()
adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) adagrad = paddle.optimizer.Adagrad(learning_rate=0.002)
adagrad.minimize(cost) adagrad.minimize(cost)
train_data = paddle.batch( train_data = paddle.batch(
......
...@@ -139,7 +139,7 @@ def train(net_type, use_cuda, save_dirname, is_local): ...@@ -139,7 +139,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
# Test program # Test program
test_program = train_program.clone(for_test=True) test_program = train_program.clone(for_test=True)
optimizer = fluid.optimizer.Lamb(learning_rate=0.001) optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
amp_lists = paddle.static.amp.AutoMixedPrecisionLists( amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
custom_black_varnames={"loss", "conv2d_0.w_0"} custom_black_varnames={"loss", "conv2d_0.w_0"}
...@@ -513,7 +513,7 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase): ...@@ -513,7 +513,7 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase):
) )
avg_cost = paddle.mean(cost) avg_cost = paddle.mean(cost)
optimizer = fluid.optimizer.Lamb(learning_rate=0.001) optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
amp_lists = paddle.static.amp.AutoMixedPrecisionLists( amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
custom_black_varnames={"loss", "conv2d_0.w_0"} custom_black_varnames={"loss", "conv2d_0.w_0"}
) )
......
...@@ -73,8 +73,8 @@ def train(args, place, to_static): ...@@ -73,8 +73,8 @@ def train(args, place, to_static):
policy = Policy() policy = Policy()
eps = np.finfo(np.float32).eps.item() eps = np.finfo(np.float32).eps.item()
optimizer = fluid.optimizer.AdamaxOptimizer( optimizer = paddle.optimizer.Adamax(
learning_rate=1e-2, parameter_list=policy.parameters() learning_rate=1e-2, parameters=policy.parameters()
) )
def get_mean_and_std(values=[]): def get_mean_and_std(values=[]):
......
...@@ -328,8 +328,8 @@ def train(args, to_static): ...@@ -328,8 +328,8 @@ def train(args, to_static):
model = GRU(args.vocab_size, args.batch_size, args.padding_size) model = GRU(args.vocab_size, args.batch_size, args.padding_size)
elif args.model_type == 'bigru_net': elif args.model_type == 'bigru_net':
model = BiGRU(args.vocab_size, args.batch_size, args.padding_size) model = BiGRU(args.vocab_size, args.batch_size, args.padding_size)
sgd_optimizer = fluid.optimizer.Adagrad( sgd_optimizer = paddle.optimizer.Adagrad(
learning_rate=args.lr, parameter_list=model.parameters() learning_rate=args.lr, parameters=model.parameters()
) )
loss_data = [] loss_data = []
......
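The dygraph hunks above also rename the argument that passes trainable variables: the removed fluid optimizers took `parameter_list`, while the `paddle.optimizer` classes take `parameters`. A minimal sketch of the new-style construction with Adagrad, assuming a small illustrative model (the shapes and the Sequential stack are not from this change):

import paddle

model = paddle.nn.Sequential(paddle.nn.Linear(8, 8), paddle.nn.ReLU())  # illustrative model

# fluid (removed): fluid.optimizer.Adagrad(learning_rate=0.002, parameter_list=model.parameters())
opt = paddle.optimizer.Adagrad(
    learning_rate=0.002,
    parameters=model.parameters(),  # keyword renamed from parameter_list
)
loss = paddle.mean(model(paddle.randn([4, 8])))
loss.backward()
opt.step()
opt.clear_grad()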
...@@ -30,7 +30,7 @@ with fluid.device_guard("gpu"): ...@@ -30,7 +30,7 @@ with fluid.device_guard("gpu"):
input_y = paddle.cast(input_y, dtype="int64") input_y = paddle.cast(input_y, dtype="int64")
cost = mlp(input_x, input_y) cost = mlp(input_x, input_y)
optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) optimizer = paddle.optimizer.Adagrad(learning_rate=0.01)
role = role_maker.PaddleCloudRoleMaker() role = role_maker.PaddleCloudRoleMaker()
fleet.init(role) fleet.init(role)
......
...@@ -403,118 +403,5 @@ class TestAdadeltaMultiPrecision2_0(unittest.TestCase): ...@@ -403,118 +403,5 @@ class TestAdadeltaMultiPrecision2_0(unittest.TestCase):
) )
class TestAdadeltaMultiPrecision1_0(unittest.TestCase):
def dygraph_adadelta_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.Adadelta(
learning_rate=0.001,
parameter_list=model.parameters(),
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_adadelta_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_adadelta_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_adadelta_mp(use_amp=True, mp=True)
output2_st = self.static_adadelta_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -369,117 +369,6 @@ class TestAdagradMultiPrecision2_0(unittest.TestCase): ...@@ -369,117 +369,6 @@ class TestAdagradMultiPrecision2_0(unittest.TestCase):
) )
class TestAdagradMultiPrecision1_0(unittest.TestCase):
def dygraph_adagrad_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.Adagrad(
learning_rate=0.001, parameter_list=model.parameters()
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_adagrad_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.Adagrad(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_adagrad_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_adagrad_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_adagrad_mp(use_amp=True, mp=True)
output2_st = self.static_adagrad_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static() paddle.enable_static()
unittest.main() unittest.main()
...@@ -397,114 +397,5 @@ class TestAdamaxMultiPrecision2_0(unittest.TestCase): ...@@ -397,114 +397,5 @@ class TestAdamaxMultiPrecision2_0(unittest.TestCase):
) )
class TestAdamaxMultiPrecision1_0(unittest.TestCase):
def dygraph_adamax_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.Adamax(
learning_rate=0.001, parameter_list=model.parameters()
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_adamax_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.Adamax(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_adamax_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_adamax_mp(use_amp=False, mp=False)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_adamax_mp(use_amp=True, mp=True)
output2_st = self.static_adamax_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -614,7 +614,7 @@ class TestMutiTask(unittest.TestCase): ...@@ -614,7 +614,7 @@ class TestMutiTask(unittest.TestCase):
one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1) one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1)
adam = optimizer.Adam(learning_rate=0.001) adam = optimizer.Adam(learning_rate=0.001)
adagrad = optimizer.Adagrad(learning_rate=0.001) adagrad = paddle.optimizer.Adagrad(learning_rate=0.001)
def fn_1(): def fn_1():
sum = paddle.multiply(x, y) sum = paddle.multiply(x, y)
......
...@@ -42,7 +42,7 @@ def train(network, use_cuda, batch_size=32, pass_num=2): ...@@ -42,7 +42,7 @@ def train(network, use_cuda, batch_size=32, pass_num=2):
cost = network(data, label, word_dict_size) cost = network(data, label, word_dict_size)
cost.persistable = True cost.persistable = True
optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
optimizer.minimize(cost) optimizer.minimize(cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
...@@ -23,22 +23,17 @@ from paddle import fluid ...@@ -23,22 +23,17 @@ from paddle import fluid
from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import ( from paddle.fluid.optimizer import (
AdadeltaOptimizer,
AdagradOptimizer,
Adam, Adam,
AdamaxOptimizer,
DecayedAdagradOptimizer, DecayedAdagradOptimizer,
DpsgdOptimizer, DpsgdOptimizer,
ExponentialMovingAverage, ExponentialMovingAverage,
FtrlOptimizer, FtrlOptimizer,
LambOptimizer,
LarsMomentumOptimizer, LarsMomentumOptimizer,
LookaheadOptimizer, LookaheadOptimizer,
ModelAverage, ModelAverage,
MomentumOptimizer, MomentumOptimizer,
PipelineOptimizer, PipelineOptimizer,
RecomputeOptimizer, RecomputeOptimizer,
RMSPropOptimizer,
SGDOptimizer, SGDOptimizer,
) )
...@@ -593,13 +588,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): ...@@ -593,13 +588,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = AdagradOptimizer( optimizer = paddle.optimizer.Adagrad(
learning_rate=0.2, parameter_list=parameter_list learning_rate=0.2, parameters=parameter_list
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = AdagradOptimizer(learning_rate=0.2) optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
return optimizer return optimizer
def test_adagrad(self): def test_adagrad(self):
...@@ -608,13 +603,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): ...@@ -608,13 +603,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = AdamaxOptimizer( optimizer = paddle.optimizer.Adamax(
learning_rate=0.2, parameter_list=parameter_list learning_rate=0.2, parameters=parameter_list
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = AdamaxOptimizer(learning_rate=0.2) optimizer = paddle.optimizer.Adamax(learning_rate=0.2)
return optimizer return optimizer
def test_adamax(self): def test_adamax(self):
...@@ -661,16 +656,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): ...@@ -661,16 +656,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = AdadeltaOptimizer( optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003, learning_rate=0.0003,
epsilon=1.0e-6, epsilon=1.0e-6,
rho=0.95, rho=0.95,
parameter_list=parameter_list, parameters=parameter_list,
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = AdadeltaOptimizer( optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95 learning_rate=0.0003, epsilon=1.0e-6, rho=0.95
) )
return optimizer return optimizer
...@@ -681,13 +676,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): ...@@ -681,13 +676,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = RMSPropOptimizer( optimizer = paddle.optimizer.RMSProp(
learning_rate=0.1, parameter_list=parameter_list learning_rate=0.1, parameters=parameter_list
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = RMSPropOptimizer(learning_rate=0.1) optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
return optimizer return optimizer
def test_rmsprop(self): def test_rmsprop(self):
...@@ -715,15 +710,15 @@ def exclude_fn(param): ...@@ -715,15 +710,15 @@ def exclude_fn(param):
class TestImperativeLambOptimizer(TestImperativeOptimizerBase): class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = LambOptimizer( optimizer = paddle.optimizer.Lamb(
learning_rate=0.002, learning_rate=0.002,
exclude_from_weight_decay_fn=exclude_fn, exclude_from_weight_decay_fn=exclude_fn,
parameter_list=parameter_list, parameters=parameter_list,
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = LambOptimizer( optimizer = paddle.optimizer.Lamb(
learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn
) )
return optimizer return optimizer
......
...@@ -23,9 +23,6 @@ from paddle import fluid ...@@ -23,9 +23,6 @@ from paddle import fluid
from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import ( from paddle.fluid.optimizer import (
AdadeltaOptimizer,
AdagradOptimizer,
AdamaxOptimizer,
DecayedAdagradOptimizer, DecayedAdagradOptimizer,
DpsgdOptimizer, DpsgdOptimizer,
ExponentialMovingAverage, ExponentialMovingAverage,
...@@ -36,7 +33,6 @@ from paddle.fluid.optimizer import ( ...@@ -36,7 +33,6 @@ from paddle.fluid.optimizer import (
MomentumOptimizer, MomentumOptimizer,
PipelineOptimizer, PipelineOptimizer,
RecomputeOptimizer, RecomputeOptimizer,
RMSPropOptimizer,
) )
# Note(wangzhongpu) # Note(wangzhongpu)
...@@ -721,13 +717,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): ...@@ -721,13 +717,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = AdagradOptimizer( optimizer = paddle.optimizer.Adagrad(
learning_rate=0.2, parameter_list=parameter_list learning_rate=0.2, parameters=parameter_list
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = AdagradOptimizer(learning_rate=0.2) optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
return optimizer return optimizer
def test_adagrad(self): def test_adagrad(self):
...@@ -736,13 +732,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): ...@@ -736,13 +732,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = AdamaxOptimizer( optimizer = paddle.optimizer.Adamax(
learning_rate=0.2, parameter_list=parameter_list learning_rate=0.2, parameters=parameter_list
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = AdamaxOptimizer(learning_rate=0.2) optimizer = paddle.optimizer.Adamax(learning_rate=0.2)
return optimizer return optimizer
def test_adamax(self): def test_adamax(self):
...@@ -789,16 +785,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): ...@@ -789,16 +785,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = AdadeltaOptimizer( optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003, learning_rate=0.0003,
epsilon=1.0e-6, epsilon=1.0e-6,
rho=0.95, rho=0.95,
parameter_list=parameter_list, parameters=parameter_list,
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = AdadeltaOptimizer( optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95 learning_rate=0.0003, epsilon=1.0e-6, rho=0.95
) )
return optimizer return optimizer
...@@ -809,13 +805,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): ...@@ -809,13 +805,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = RMSPropOptimizer( optimizer = paddle.optimizer.RMSProp(
learning_rate=0.1, parameter_list=parameter_list learning_rate=0.1, parameters=parameter_list
) )
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = RMSPropOptimizer(learning_rate=0.1) optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
return optimizer return optimizer
def test_rmsprop(self): def test_rmsprop(self):
......
...@@ -677,11 +677,11 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -677,11 +677,11 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
out = linear(inp) out = linear(inp)
loss = paddle.mean(out) loss = paddle.mean(out)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
momentum = paddle.fluid.contrib.optimizer.Momentum( momentum = paddle.optimizer.Momentum(
learning_rate=0.01, learning_rate=0.01,
momentum=0.9, momentum=0.9,
parameter_list=linear.parameters(), parameters=linear.parameters(),
regularization=regularization, weight_decay=regularization,
) )
momentum.minimize(loss) momentum.minimize(loss)
...@@ -703,7 +703,7 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -703,7 +703,7 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
) )
avg_cost = paddle.mean(cost) avg_cost = paddle.mean(cost)
momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( momentum_optimizer = paddle.optimizer.Momentum(
learning_rate=0.1, momentum=0.9 learning_rate=0.1, momentum=0.9
) )
momentum_optimizer.minimize(avg_cost) momentum_optimizer.minimize(avg_cost)
...@@ -833,11 +833,11 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -833,11 +833,11 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
weight_attr=paddle.nn.initializer.Constant(value=2.0), weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0), bias_attr=paddle.nn.initializer.Constant(value=2.0),
) )
momentum_new = paddle.fluid.contrib.optimizer.Momentum( momentum_new = paddle.optimizer.Momentum(
learning_rate=0.01, learning_rate=0.01,
momentum=0.9, momentum=0.9,
parameter_list=linear_new.parameters(), parameters=linear_new.parameters(),
regularization=paddle.regularizer.L2Decay(coeff=0.1), weight_decay=paddle.regularizer.L2Decay(coeff=0.1),
) )
self.__update_params(momentum=momentum_new, linear=linear_new) self.__update_params(momentum=momentum_new, linear=linear_new)
......
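In the Momentum hunks above, `paddle.fluid.contrib.optimizer.Momentum` is swapped for `paddle.optimizer.Momentum`, with `parameter_list` becoming `parameters` and `regularization` becoming `weight_decay`. A minimal sketch of the new call, assuming an illustrative linear layer:

import paddle

linear = paddle.nn.Linear(10, 10)  # illustrative model
momentum = paddle.optimizer.Momentum(
    learning_rate=0.01,
    momentum=0.9,
    parameters=linear.parameters(),                      # was parameter_list
    weight_decay=paddle.regularizer.L2Decay(coeff=0.1),  # was regularization
)

loss = paddle.mean(linear(paddle.randn([4, 10])))
loss.backward()
momentum.step()
momentum.clear_grad()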
...@@ -248,72 +248,6 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -248,72 +248,6 @@ class TestMomentumOptimizer(unittest.TestCase):
self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) self.assertAlmostEqual(init_ops[0].attr('value'), 0.0)
class TestAdagradOptimizer(unittest.TestCase):
class MockAdagrad(optimizer.AdagradOptimizer):
def get_accumulators(self):
return self._accumulators
def get_moment_str(self):
return self._moment_acc_str
def test_adagrad_optimizer(self):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1},
)
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y"
)
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out"
)
block.append_op(
type="mul",
inputs={"X": mul_x, "Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1},
)
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out"
)
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}
)
learning_rate = 0.01
adagrad_optimizer = self.MockAdagrad(
learning_rate=learning_rate, epsilon=1.0e-6
)
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
with framework.program_guard(program, init_program):
opts = adagrad_optimizer.apply_gradients(params_grads)
self.assertEqual(len(opts), 2)
self.assertEqual([op.type for op in opts], ["scale", "adagrad"])
# Check accumulators
accumulators = adagrad_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 1)
self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
moment_acc = accumulators[adagrad_optimizer.get_moment_str()]
self.assertEqual(len(moment_acc), 1)
self.assertTrue(mul_x.name in moment_acc)
# Check init_program
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 2)
self.assertEqual(init_ops[1].type, "fill_constant")
self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate)
self.assertEqual(init_ops[0].type, "fill_constant")
self.assertAlmostEqual(init_ops[0].attr('value'), 0.0)
class TestAdamOptimizer(unittest.TestCase): class TestAdamOptimizer(unittest.TestCase):
class MockAdam(optimizer.AdamOptimizer): class MockAdam(optimizer.AdamOptimizer):
def get_accumulators(self): def get_accumulators(self):
...@@ -385,77 +319,6 @@ class TestAdamOptimizer(unittest.TestCase): ...@@ -385,77 +319,6 @@ class TestAdamOptimizer(unittest.TestCase):
self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate)
class TestAdamaxOptimizer(unittest.TestCase):
class MockAdamax(optimizer.AdamaxOptimizer):
def get_accumulators(self):
return self._accumulators
def get_moment_str(self):
return self._moment_acc_str
def get_inf_norm_str(self):
return self._inf_norm_acc_str
def test_adamax_optimizer(self):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1},
)
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y"
)
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out"
)
block.append_op(
type="mul",
inputs={"X": mul_x, "Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1},
)
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out"
)
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}
)
learning_rate = 0.01
adamax_optimizer = self.MockAdamax(
learning_rate=learning_rate, beta1=0.9, beta2=0.999
)
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
with framework.program_guard(program, init_program):
opts = adamax_optimizer.apply_gradients(params_grads)
self.assertEqual(len(opts), 3)
self.assertEqual([op.type for op in opts], ["scale", "adamax", "scale"])
# Check accumulators
accumulators = adamax_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 3)
self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
moment_acc = accumulators[adamax_optimizer.get_moment_str()]
inf_norm_acc = accumulators[adamax_optimizer.get_inf_norm_str()]
self.assertEqual(len(moment_acc), 1)
self.assertEqual(len(inf_norm_acc), 1)
self.assertTrue(mul_x.name in moment_acc)
self.assertTrue(mul_x.name in inf_norm_acc)
# Check init_program
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 4)
self.assertEqual(init_ops[-1].type, "fill_constant")
self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate)
class TestDpsgdOptimizer(unittest.TestCase): class TestDpsgdOptimizer(unittest.TestCase):
def test_dpsgd_optimizer(self): def test_dpsgd_optimizer(self):
def check_dpsgd_optimizer(optimizer_attr): def check_dpsgd_optimizer(optimizer_attr):
......
...@@ -203,9 +203,9 @@ class TestRegularizer(unittest.TestCase): ...@@ -203,9 +203,9 @@ class TestRegularizer(unittest.TestCase):
avg_cost = model(data, label, self.word_len) avg_cost = model(data, label, self.word_len)
optimizer = fluid.optimizer.Adagrad( optimizer = paddle.optimizer.Adagrad(
learning_rate=0.1, learning_rate=0.1,
regularization=paddle.regularizer.L2Decay(1.0), weight_decay=paddle.regularizer.L2Decay(1.0),
) )
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label]) param_sum = self.run_program(place, [data, label])
...@@ -236,7 +236,7 @@ class TestRegularizer(unittest.TestCase): ...@@ -236,7 +236,7 @@ class TestRegularizer(unittest.TestCase):
para_sum.append(paddle.sum(para_mul)) para_sum.append(paddle.sum(para_mul))
avg_cost_l2 += paddle.add_n(para_sum) * 0.5 avg_cost_l2 += paddle.add_n(para_sum) * 0.5
optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) optimizer = paddle.optimizer.Adagrad(learning_rate=0.1)
optimizer.minimize(avg_cost_l2) optimizer.minimize(avg_cost_l2)
param_sum = self.run_program(place, [data, label]) param_sum = self.run_program(place, [data, label])
return param_sum return param_sum
......
...@@ -116,9 +116,9 @@ class TestRegularizer(unittest.TestCase): ...@@ -116,9 +116,9 @@ class TestRegularizer(unittest.TestCase):
avg_cost = model(data, label, self.word_len) avg_cost = model(data, label, self.word_len)
optimizer = fluid.optimizer.Adagrad( optimizer = paddle.optimizer.Adagrad(
learning_rate=0.1, learning_rate=0.1,
regularization=paddle.regularizer.L2Decay(1.0), weight_decay=paddle.regularizer.L2Decay(1.0),
) )
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label]) param_sum = self.run_program(place, [data, label])
...@@ -149,7 +149,7 @@ class TestRegularizer(unittest.TestCase): ...@@ -149,7 +149,7 @@ class TestRegularizer(unittest.TestCase):
para_sum.append(paddle.sum(para_mul)) para_sum.append(paddle.sum(para_mul))
avg_cost_l2 += paddle.add_n(para_sum) * 0.5 avg_cost_l2 += paddle.add_n(para_sum) * 0.5
optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) optimizer = paddle.optimizer.Adagrad(learning_rate=0.1)
optimizer.minimize(avg_cost_l2) optimizer.minimize(avg_cost_l2)
param_sum = self.run_program(place, [data, label]) param_sum = self.run_program(place, [data, label])
return param_sum return param_sum
......
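The regularizer tests above keep the static-graph `minimize` flow and only move the decay from `regularization` to `weight_decay`. A minimal static-mode sketch under that assumption, using a small fc-plus-mean network like the ones in the removed multi-precision tests (the shapes are illustrative):

import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='X', shape=[2, 2], dtype='float32')
    hidden = paddle.static.nn.fc(x=x, size=10)
    loss = paddle.mean(hidden)
    opt = paddle.optimizer.Adagrad(
        learning_rate=0.1,
        weight_decay=paddle.regularizer.L2Decay(1.0),  # was regularization in fluid
    )
    opt.minimize(loss)
# an Executor run over main_prog would follow, as in the tests above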
...@@ -521,119 +521,6 @@ class TestRMSPropMultiPrecision2_0(unittest.TestCase): ...@@ -521,119 +521,6 @@ class TestRMSPropMultiPrecision2_0(unittest.TestCase):
) )
class TestRMSPropMultiPrecision1_0(unittest.TestCase):
def dygraph_rmsprop_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.RMSProp(
learning_rate=0.001,
parameter_list=model.parameters(),
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_rmsprop_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.RMSProp(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_rmsprop_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_rmsprop_mp(use_amp=True, mp=True)
output2_st = self.static_rmsprop_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static() paddle.enable_static()
unittest.main() unittest.main()
...@@ -73,8 +73,8 @@ class TestTrainable(unittest.TestCase): ...@@ -73,8 +73,8 @@ class TestTrainable(unittest.TestCase):
self.check_trainable( self.check_trainable(
test_trainable, test_trainable,
feed_dict, feed_dict,
op_count={'adamax': 1, 'scale': 1, 'mul_grad': 0}, op_count={'adamax': 1, 'scale': 1, 'mul_grad': 1},
optimizer=fluid.optimizer.Adamax(learning_rate=0.2), optimizer=paddle.optimizer.Adamax(learning_rate=0.2),
) )
......
...@@ -157,7 +157,7 @@ class TestWeightDecay(unittest.TestCase): ...@@ -157,7 +157,7 @@ class TestWeightDecay(unittest.TestCase):
for var in main_prog.block(0).all_parameters() for var in main_prog.block(0).all_parameters()
] ]
optimizer = fluid.optimizer.Adagrad( optimizer = paddle.optimizer.Adagrad(
learning_rate=self.learning_rate learning_rate=self.learning_rate
) )
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
......