Unverified commit 94365855, authored by LoneRanger, committed by GitHub


replace the AdagradOptimizer, AdamaxOptimizer, AdadeltaOptimizer, RMSPropOptimizer, LambOptimizer and Momentum (#54152)

* replace the AdadeltaOptimizer with Adadelta

* replace the RMSPropOptimizer with RMSProp

* replace the LambOptimizer with Lamb

* replace the momentum in contrib/optimizer.py with Momentum in python/paddle/optimizer/momentum.py

* fix bug

* fix bug

* fix bug

* fix bug of Lamb

* fix bug of Lamb

* fix bug of import

* replace the AdamaxOptimizer with Adamax and change the optimizer base for AdagradOptimizer

* fix bug

* fix bug

* Update optimizer.py

* fix bug

* fix bug
Parent a1396a80
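In practice, the migration in this commit is mostly mechanical: each legacy paddle.fluid.optimizer.*Optimizer class is swapped for its paddle.optimizer counterpart, and the keyword arguments parameter_list and regularization become parameters and weight_decay. A minimal before/after sketch (illustrative layer and values, not taken from this diff):

import paddle

linear = paddle.nn.Linear(10, 1)

# 1.x style removed by this commit
# opt = paddle.fluid.optimizer.AdagradOptimizer(
#     learning_rate=0.1,
#     parameter_list=linear.parameters(),
#     regularization=paddle.regularizer.L2Decay(1.0),
# )

# 2.x replacement
opt = paddle.optimizer.Adagrad(
    learning_rate=0.1,
    parameters=linear.parameters(),
    weight_decay=paddle.regularizer.L2Decay(1.0),
)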
@@ -13,8 +13,8 @@
import logging
+import paddle
from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.optimizer import LambOptimizer as LAMB
from .meta_optimizer_base import MetaOptimizerBase
@@ -55,14 +55,13 @@ class LambOptimizer(MetaOptimizerBase):
_exclude_from_weight_decay_fn = exclude_fn
-self.lamb_opt = LAMB(
+self.lamb_opt = paddle.optimizer.Lamb(
learning_rate=opt._learning_rate,
lamb_weight_decay=configs['lamb_weight_decay'],
beta1=opt._beta1,
beta2=opt._beta2,
epsilon=opt._epsilon,
-parameter_list=opt._parameter_list,
-regularization=opt.regularization,
+parameters=opt._parameter_list,
grad_clip=opt._grad_clip,
exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn,
name=opt._name,
@@ -111,7 +110,7 @@ class LambOptimizer(MetaOptimizerBase):
return self.lamb_opt.apply_gradients(params_grads=params_grads)
def apply_optimize(self, loss, startup_program, params_grads):
-return self.lamb_opt.apply_optimize(
+return self.lamb_opt._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
......
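The dropped regularization=opt.regularization argument above reflects an API difference: paddle.optimizer.Lamb has no regularization parameter and instead applies decoupled weight decay through lamb_weight_decay, optionally skipping parameters via exclude_from_weight_decay_fn. A standalone construction mirroring the wrapper, with hypothetical values and an illustrative exclusion rule:

import paddle

model = paddle.nn.Linear(8, 8)
lamb = paddle.optimizer.Lamb(
    learning_rate=0.002,
    lamb_weight_decay=0.01,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-6,
    parameters=model.parameters(),
    grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0),
    # skip weight decay for bias parameters (illustrative rule)
    exclude_from_weight_decay_fn=lambda p: 'b_0' in p.name,
)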
@@ -53,7 +53,6 @@ from . import initializer
from .initializer import set_global_initializer
from . import layers
from . import dygraph
-from . import contrib
from . import optimizer
from . import backward
from .backward import gradients
@@ -105,7 +104,6 @@ __all__ = (
'io',
'initializer',
'layers',
-'contrib',
'dygraph',
'enable_dygraph',
'disable_dygraph',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import optimizer
from .optimizer import *
__all__ = []
__all__ += optimizer.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Optimizer
from paddle.regularizer import L1Decay
from paddle.regularizer import L2Decay
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.framework import program_guard
from paddle.fluid import unique_name
from paddle.fluid import layers
from paddle.fluid.layer_helper import LayerHelper
import warnings
from paddle import _C_ops, _legacy_C_ops
__all__ = ['Momentum']
class Momentum(Optimizer):
r"""
Simple Momentum optimizer with velocity state
This optimizer has a flag for Nesterov Momentum.
The update equations are as follows:
.. math::
& velocity = mu * velocity + gradient
& if (use\_nesterov):
&\quad param = param - (gradient + mu * velocity) * learning\_rate
& else:
&\quad param = param - learning\_rate * velocity
Parameters:
learning_rate (float|Variable): The learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
momentum (float): Momentum factor
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
Often choose to be ``1.0/batch_size``.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
import numpy as np
paddle.enable_static()
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = paddle.static.data(name='x', shape=[1, 13], dtype='float32')
y = paddle.static.data(name='y', shape=[1], dtype='float32')
linear = paddle.nn.Linear(13, 1)
y_predict = linear(x)
cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
moment_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(paddle.static.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
"""
_velocity_acc_str = "velocity"
def __init__(
self,
learning_rate,
momentum,
parameter_list=None,
use_nesterov=False,
regularization=None,
grad_clip=None,
multi_precision=False,
rescale_grad=1.0,
name=None,
):
assert learning_rate is not None
assert momentum is not None
predicate = lambda regular: isinstance(regular, L2Decay)
py_regular = None if predicate(regularization) else regularization
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=py_regular,
grad_clip=grad_clip,
name=name,
)
self.type = "momentum"
self._momentum = momentum
self._use_nesterov = bool(use_nesterov)
self._regularization_method = ""
self._regularization_coeff = 0
if isinstance(regularization, L2Decay):
self._regularization_method = "l2_decay"
self._regularization_coeff = regularization._coeff
self._multi_precision = multi_precision
self._rescale_grad = rescale_grad
self._master_weights = {}
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer."
)
self._add_accumulator(self._velocity_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
velocity_acc = self._get_accumulator(
self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if framework.in_dygraph_mode():
_, _, _ = _legacy_C_ops.momentum(
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
param_and_grad[0],
velocity_acc,
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
self._regularization_method,
'regularization_coeff',
self._regularization_coeff,
'multi_precision',
find_master,
)
return None
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": self._regularization_method,
"regularization_coeff": self._regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc],
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return momentum_op
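With paddle.fluid.contrib dropped from the package list below, call sites that used fluid.contrib.optimizer.Momentum (with parameter_list/regularization, as in the class above) move to paddle.optimizer.Momentum, which exposes the same knobs as parameters/weight_decay; the momentum test updates further down show the same rename. A minimal dygraph sketch with illustrative shapes and values:

import paddle

linear = paddle.nn.Linear(13, 1)
opt = paddle.optimizer.Momentum(
    learning_rate=0.001,
    momentum=0.9,
    parameters=linear.parameters(),
    use_nesterov=False,
    weight_decay=paddle.regularizer.L2Decay(1e-4),
)

loss = paddle.mean(linear(paddle.randn([4, 13])))
loss.backward()
opt.step()        # apply one momentum update
opt.clear_grad()  # reset gradients for the next iteration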
(This diff is collapsed.)
@@ -426,7 +426,6 @@ packages=['paddle',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
-'paddle.fluid.contrib',
'paddle.fluid.incubate',
'paddle.incubate.distributed.fleet',
'paddle.fluid.incubate.checkpoint',
......
@@ -1430,7 +1430,6 @@ def get_setup_parameters():
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
-'paddle.fluid.contrib',
'paddle.fluid.incubate',
'paddle.incubate.distributed.fleet',
'paddle.fluid.incubate.checkpoint',
......
@@ -70,7 +70,7 @@ class TestStaticDecorate(AmpTestBase):
)
out = model(x)
loss = paddle.mean(out)
-optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001)
+optimizer = paddle.optimizer.Adadelta(learning_rate=0.001)
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
......
@@ -84,7 +84,7 @@ def train(
else:
raise NotImplementedError()
-adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
+adagrad = paddle.optimizer.Adagrad(learning_rate=0.002)
adagrad.minimize(cost)
train_data = paddle.batch(
......
@@ -139,7 +139,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
# Test program
test_program = train_program.clone(for_test=True)
-optimizer = fluid.optimizer.Lamb(learning_rate=0.001)
+optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
custom_black_varnames={"loss", "conv2d_0.w_0"}
@@ -513,7 +513,7 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase):
)
avg_cost = paddle.mean(cost)
-optimizer = fluid.optimizer.Lamb(learning_rate=0.001)
+optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
custom_black_varnames={"loss", "conv2d_0.w_0"}
)
......
@@ -73,8 +73,8 @@ def train(args, place, to_static):
policy = Policy()
eps = np.finfo(np.float32).eps.item()
-optimizer = fluid.optimizer.AdamaxOptimizer(
-learning_rate=1e-2, parameter_list=policy.parameters()
+optimizer = paddle.optimizer.Adamax(
+learning_rate=1e-2, parameters=policy.parameters()
)
def get_mean_and_std(values=[]):
......
@@ -328,8 +328,8 @@ def train(args, to_static):
model = GRU(args.vocab_size, args.batch_size, args.padding_size)
elif args.model_type == 'bigru_net':
model = BiGRU(args.vocab_size, args.batch_size, args.padding_size)
-sgd_optimizer = fluid.optimizer.Adagrad(
-learning_rate=args.lr, parameter_list=model.parameters()
+sgd_optimizer = paddle.optimizer.Adagrad(
+learning_rate=args.lr, parameters=model.parameters()
)
loss_data = []
......
@@ -30,7 +30,7 @@ with fluid.device_guard("gpu"):
input_y = paddle.cast(input_y, dtype="int64")
cost = mlp(input_x, input_y)
-optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
+optimizer = paddle.optimizer.Adagrad(learning_rate=0.01)
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
......
@@ -403,118 +403,5 @@ class TestAdadeltaMultiPrecision2_0(unittest.TestCase):
)
class TestAdadeltaMultiPrecision1_0(unittest.TestCase):
def dygraph_adadelta_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.Adadelta(
learning_rate=0.001,
parameter_list=model.parameters(),
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_adadelta_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_adadelta_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_adadelta_mp(use_amp=True, mp=True)
output2_st = self.static_adadelta_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__":
unittest.main()
@@ -369,117 +369,6 @@ class TestAdagradMultiPrecision2_0(unittest.TestCase):
)
class TestAdagradMultiPrecision1_0(unittest.TestCase):
def dygraph_adagrad_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.Adagrad(
learning_rate=0.001, parameter_list=model.parameters()
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_adagrad_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.Adagrad(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_adagrad_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_adagrad_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_adagrad_mp(use_amp=True, mp=True)
output2_st = self.static_adagrad_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
@@ -397,114 +397,5 @@ class TestAdamaxMultiPrecision2_0(unittest.TestCase):
)
class TestAdamaxMultiPrecision1_0(unittest.TestCase):
def dygraph_adamax_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.Adamax(
learning_rate=0.001, parameter_list=model.parameters()
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_adamax_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.Adamax(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_adamax_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_adamax_mp(use_amp=False, mp=False)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_adamax_mp(use_amp=True, mp=True)
output2_st = self.static_adamax_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__":
unittest.main()
@@ -614,7 +614,7 @@ class TestMutiTask(unittest.TestCase):
one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1)
adam = optimizer.Adam(learning_rate=0.001)
-adagrad = optimizer.Adagrad(learning_rate=0.001)
+adagrad = paddle.optimizer.Adagrad(learning_rate=0.001)
def fn_1():
sum = paddle.multiply(x, y)
......
@@ -42,7 +42,7 @@ def train(network, use_cuda, batch_size=32, pass_num=2):
cost = network(data, label, word_dict_size)
cost.persistable = True
-optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
optimizer.minimize(cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
@@ -23,22 +23,17 @@ from paddle import fluid
from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer
from paddle.fluid import core
from paddle.fluid.optimizer import (
-AdadeltaOptimizer,
-AdagradOptimizer,
Adam,
-AdamaxOptimizer,
DecayedAdagradOptimizer,
DpsgdOptimizer,
ExponentialMovingAverage,
FtrlOptimizer,
-LambOptimizer,
LarsMomentumOptimizer,
LookaheadOptimizer,
ModelAverage,
MomentumOptimizer,
PipelineOptimizer,
RecomputeOptimizer,
-RMSPropOptimizer,
SGDOptimizer,
)
@@ -593,13 +588,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = AdagradOptimizer(
-learning_rate=0.2, parameter_list=parameter_list
+optimizer = paddle.optimizer.Adagrad(
+learning_rate=0.2, parameters=parameter_list
)
return optimizer
def get_optimizer(self):
-optimizer = AdagradOptimizer(learning_rate=0.2)
+optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
return optimizer
def test_adagrad(self):
@@ -608,13 +603,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = AdamaxOptimizer(
-learning_rate=0.2, parameter_list=parameter_list
+optimizer = paddle.optimizer.Adamax(
+learning_rate=0.2, parameters=parameter_list
)
return optimizer
def get_optimizer(self):
-optimizer = AdamaxOptimizer(learning_rate=0.2)
+optimizer = paddle.optimizer.Adamax(learning_rate=0.2)
return optimizer
def test_adamax(self):
@@ -661,16 +656,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = AdadeltaOptimizer(
+optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003,
epsilon=1.0e-6,
rho=0.95,
-parameter_list=parameter_list,
+parameters=parameter_list,
)
return optimizer
def get_optimizer(self):
-optimizer = AdadeltaOptimizer(
+optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95
)
return optimizer
@@ -681,13 +676,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = RMSPropOptimizer(
-learning_rate=0.1, parameter_list=parameter_list
+optimizer = paddle.optimizer.RMSProp(
+learning_rate=0.1, parameters=parameter_list
)
return optimizer
def get_optimizer(self):
-optimizer = RMSPropOptimizer(learning_rate=0.1)
+optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
return optimizer
def test_rmsprop(self):
@@ -715,15 +710,15 @@ def exclude_fn(param):
class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = LambOptimizer(
+optimizer = paddle.optimizer.Lamb(
learning_rate=0.002,
exclude_from_weight_decay_fn=exclude_fn,
-parameter_list=parameter_list,
+parameters=parameter_list,
)
return optimizer
def get_optimizer(self):
-optimizer = LambOptimizer(
+optimizer = paddle.optimizer.Lamb(
learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn
)
return optimizer
......
@@ -23,9 +23,6 @@ from paddle import fluid
from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer
from paddle.fluid import core
from paddle.fluid.optimizer import (
-AdadeltaOptimizer,
-AdagradOptimizer,
-AdamaxOptimizer,
DecayedAdagradOptimizer,
DpsgdOptimizer,
ExponentialMovingAverage,
@@ -36,7 +33,6 @@ from paddle.fluid.optimizer import (
MomentumOptimizer,
PipelineOptimizer,
RecomputeOptimizer,
-RMSPropOptimizer,
)
# Note(wangzhongpu)
@@ -721,13 +717,13 @@ class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = AdagradOptimizer(
-learning_rate=0.2, parameter_list=parameter_list
+optimizer = paddle.optimizer.Adagrad(
+learning_rate=0.2, parameters=parameter_list
)
return optimizer
def get_optimizer(self):
-optimizer = AdagradOptimizer(learning_rate=0.2)
+optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
return optimizer
def test_adagrad(self):
@@ -736,13 +732,13 @@ class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = AdamaxOptimizer(
-learning_rate=0.2, parameter_list=parameter_list
+optimizer = paddle.optimizer.Adamax(
+learning_rate=0.2, parameters=parameter_list
)
return optimizer
def get_optimizer(self):
-optimizer = AdamaxOptimizer(learning_rate=0.2)
+optimizer = paddle.optimizer.Adamax(learning_rate=0.2)
return optimizer
def test_adamax(self):
@@ -789,16 +785,16 @@ class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = AdadeltaOptimizer(
+optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003,
epsilon=1.0e-6,
rho=0.95,
-parameter_list=parameter_list,
+parameters=parameter_list,
)
return optimizer
def get_optimizer(self):
-optimizer = AdadeltaOptimizer(
+optimizer = paddle.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95
)
return optimizer
@@ -809,13 +805,13 @@ class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
-optimizer = RMSPropOptimizer(
-learning_rate=0.1, parameter_list=parameter_list
+optimizer = paddle.optimizer.RMSProp(
+learning_rate=0.1, parameters=parameter_list
)
return optimizer
def get_optimizer(self):
-optimizer = RMSPropOptimizer(learning_rate=0.1)
+optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
return optimizer
def test_rmsprop(self):
......
@@ -677,11 +677,11 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
out = linear(inp)
loss = paddle.mean(out)
# This can be any optimizer supported by dygraph.
-momentum = paddle.fluid.contrib.optimizer.Momentum(
+momentum = paddle.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
-parameter_list=linear.parameters(),
-regularization=regularization,
+parameters=linear.parameters(),
+weight_decay=regularization,
)
momentum.minimize(loss)
@@ -703,7 +703,7 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
)
avg_cost = paddle.mean(cost)
-momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum(
+momentum_optimizer = paddle.optimizer.Momentum(
learning_rate=0.1, momentum=0.9
)
momentum_optimizer.minimize(avg_cost)
@@ -833,11 +833,11 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
-momentum_new = paddle.fluid.contrib.optimizer.Momentum(
+momentum_new = paddle.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
-parameter_list=linear_new.parameters(),
-regularization=paddle.regularizer.L2Decay(coeff=0.1),
+parameters=linear_new.parameters(),
+weight_decay=paddle.regularizer.L2Decay(coeff=0.1),
)
self.__update_params(momentum=momentum_new, linear=linear_new)
......
@@ -248,72 +248,6 @@ class TestMomentumOptimizer(unittest.TestCase):
self.assertAlmostEqual(init_ops[0].attr('value'), 0.0)
class TestAdagradOptimizer(unittest.TestCase):
class MockAdagrad(optimizer.AdagradOptimizer):
def get_accumulators(self):
return self._accumulators
def get_moment_str(self):
return self._moment_acc_str
def test_adagrad_optimizer(self):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1},
)
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y"
)
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out"
)
block.append_op(
type="mul",
inputs={"X": mul_x, "Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1},
)
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out"
)
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}
)
learning_rate = 0.01
adagrad_optimizer = self.MockAdagrad(
learning_rate=learning_rate, epsilon=1.0e-6
)
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
with framework.program_guard(program, init_program):
opts = adagrad_optimizer.apply_gradients(params_grads)
self.assertEqual(len(opts), 2)
self.assertEqual([op.type for op in opts], ["scale", "adagrad"])
# Check accumulators
accumulators = adagrad_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 1)
self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
moment_acc = accumulators[adagrad_optimizer.get_moment_str()]
self.assertEqual(len(moment_acc), 1)
self.assertTrue(mul_x.name in moment_acc)
# Check init_program
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 2)
self.assertEqual(init_ops[1].type, "fill_constant")
self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate)
self.assertEqual(init_ops[0].type, "fill_constant")
self.assertAlmostEqual(init_ops[0].attr('value'), 0.0)
class TestAdamOptimizer(unittest.TestCase):
class MockAdam(optimizer.AdamOptimizer):
def get_accumulators(self):
@@ -385,77 +319,6 @@ class TestAdamOptimizer(unittest.TestCase):
self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate)
class TestAdamaxOptimizer(unittest.TestCase):
class MockAdamax(optimizer.AdamaxOptimizer):
def get_accumulators(self):
return self._accumulators
def get_moment_str(self):
return self._moment_acc_str
def get_inf_norm_str(self):
return self._inf_norm_acc_str
def test_adamax_optimizer(self):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1},
)
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y"
)
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out"
)
block.append_op(
type="mul",
inputs={"X": mul_x, "Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1},
)
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out"
)
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}
)
learning_rate = 0.01
adamax_optimizer = self.MockAdamax(
learning_rate=learning_rate, beta1=0.9, beta2=0.999
)
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
with framework.program_guard(program, init_program):
opts = adamax_optimizer.apply_gradients(params_grads)
self.assertEqual(len(opts), 3)
self.assertEqual([op.type for op in opts], ["scale", "adamax", "scale"])
# Check accumulators
accumulators = adamax_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 3)
self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
moment_acc = accumulators[adamax_optimizer.get_moment_str()]
inf_norm_acc = accumulators[adamax_optimizer.get_inf_norm_str()]
self.assertEqual(len(moment_acc), 1)
self.assertEqual(len(inf_norm_acc), 1)
self.assertTrue(mul_x.name in moment_acc)
self.assertTrue(mul_x.name in inf_norm_acc)
# Check init_program
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 4)
self.assertEqual(init_ops[-1].type, "fill_constant")
self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate)
class TestDpsgdOptimizer(unittest.TestCase):
def test_dpsgd_optimizer(self):
def check_dpsgd_optimizer(optimizer_attr):
......
@@ -203,9 +203,9 @@ class TestRegularizer(unittest.TestCase):
avg_cost = model(data, label, self.word_len)
-optimizer = fluid.optimizer.Adagrad(
+optimizer = paddle.optimizer.Adagrad(
learning_rate=0.1,
-regularization=paddle.regularizer.L2Decay(1.0),
+weight_decay=paddle.regularizer.L2Decay(1.0),
)
optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label])
@@ -236,7 +236,7 @@ class TestRegularizer(unittest.TestCase):
para_sum.append(paddle.sum(para_mul))
avg_cost_l2 += paddle.add_n(para_sum) * 0.5
-optimizer = fluid.optimizer.Adagrad(learning_rate=0.1)
+optimizer = paddle.optimizer.Adagrad(learning_rate=0.1)
optimizer.minimize(avg_cost_l2)
param_sum = self.run_program(place, [data, label])
return param_sum
......
@@ -116,9 +116,9 @@ class TestRegularizer(unittest.TestCase):
avg_cost = model(data, label, self.word_len)
-optimizer = fluid.optimizer.Adagrad(
+optimizer = paddle.optimizer.Adagrad(
learning_rate=0.1,
-regularization=paddle.regularizer.L2Decay(1.0),
+weight_decay=paddle.regularizer.L2Decay(1.0),
)
optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label])
@@ -149,7 +149,7 @@ class TestRegularizer(unittest.TestCase):
para_sum.append(paddle.sum(para_mul))
avg_cost_l2 += paddle.add_n(para_sum) * 0.5
-optimizer = fluid.optimizer.Adagrad(learning_rate=0.1)
+optimizer = paddle.optimizer.Adagrad(learning_rate=0.1)
optimizer.minimize(avg_cost_l2)
param_sum = self.run_program(place, [data, label])
return param_sum
......
@@ -521,119 +521,6 @@ class TestRMSPropMultiPrecision2_0(unittest.TestCase):
)
class TestRMSPropMultiPrecision1_0(unittest.TestCase):
def dygraph_rmsprop_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.RMSProp(
learning_rate=0.001,
parameter_list=model.parameters(),
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_rmsprop_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.RMSProp(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(
place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
)
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_rmsprop_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_rmsprop_mp(use_amp=True, mp=True)
output2_st = self.static_rmsprop_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
@@ -73,8 +73,8 @@ class TestTrainable(unittest.TestCase):
self.check_trainable(
test_trainable,
feed_dict,
-op_count={'adamax': 1, 'scale': 1, 'mul_grad': 0},
-optimizer=fluid.optimizer.Adamax(learning_rate=0.2),
+op_count={'adamax': 1, 'scale': 1, 'mul_grad': 1},
+optimizer=paddle.optimizer.Adamax(learning_rate=0.2),
)
......
@@ -157,7 +157,7 @@ class TestWeightDecay(unittest.TestCase):
for var in main_prog.block(0).all_parameters()
]
-optimizer = fluid.optimizer.Adagrad(
+optimizer = paddle.optimizer.Adagrad(
learning_rate=self.learning_rate
)
optimizer.minimize(avg_cost)
......