Unverified Commit e122e164, authored by Zhou Wei, committed by GitHub

fix english doc, unittest, and remove useless alias of 2.0 lr_scheduler (#27686)

* fix doc and unittest of 2.0 lr_scheduler

* fix doc of 2.0 lr_scheduler

* fix unittest

* fix english doc of lr_scheduler

* fix api name of lr scheduler

* fix api name of lr scheduler
Parent 9215ad96
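For orientation, this commit moves the 2.0 schedulers from ``paddle.optimizer.lr_scheduler`` to ``paddle.optimizer.lr`` and renames ``_LRScheduler`` and the ``*LR`` classes to ``LRScheduler`` and ``*Decay`` / ``LinearWarmup`` / ``ReduceOnPlateau``. A minimal dygraph sketch with the new names, adapted from the docstring examples further down in this diff (illustrative only, not part of the change itself):

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # old name: paddle.optimizer.lr_scheduler.NoamLR(...)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

    for epoch in range(20):
        for batch_id in range(2):
            x = paddle.uniform([10, 10])
            loss = paddle.mean(linear(x))
            loss.backward()
            sgd.step()
            sgd.clear_gradients()
        scheduler.step()   # advance the schedule once per epoch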
......@@ -237,13 +237,6 @@ from .framework import save #DEFINE_ALIAS
from .framework import load #DEFINE_ALIAS
from .framework import DataParallel #DEFINE_ALIAS
from .framework import NoamDecay #DEFINE_ALIAS
from .framework import PiecewiseDecay #DEFINE_ALIAS
from .framework import NaturalExpDecay #DEFINE_ALIAS
from .framework import ExponentialDecay #DEFINE_ALIAS
from .framework import InverseTimeDecay #DEFINE_ALIAS
from .framework import PolynomialDecay #DEFINE_ALIAS
from .framework import CosineDecay #DEFINE_ALIAS
from .framework import set_default_dtype #DEFINE_ALIAS
from .framework import get_default_dtype #DEFINE_ALIAS
......
......@@ -164,7 +164,7 @@ def load_dygraph(model_path, **configs):
state_dict = emb.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......
......@@ -855,7 +855,7 @@ class Executor(object):
def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
return_numpy, return_merged):
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
exe = program._executor
# TODO(zhenghuihuang): quantization uses Graph in CompiledProgram
# instead of program. We will add support for checking Vars in Graph
......@@ -901,7 +901,7 @@ class Executor(object):
if hasattr(program._program, 'lr_sheduler'):
lr_sheduler = program._program.lr_sheduler
assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler"
assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
lr_value = lr_sheduler()
lr_var = program._program.global_block().vars[lr_sheduler._var_name]
lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
......@@ -1238,7 +1238,7 @@ class Executor(object):
def _run_program(self, program, feed, fetch_list, feed_var_name,
fetch_var_name, scope, return_numpy, use_program_cache):
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
if feed is None:
feed = {}
elif isinstance(feed, (list, tuple)):
......@@ -1296,7 +1296,7 @@ class Executor(object):
self._feed_data(program, feed, feed_var_name, scope)
if hasattr(program, 'lr_sheduler'):
assert isinstance(program.lr_sheduler,
_LRScheduler), "must be _LRScheduler"
LRScheduler), "must be LRScheduler"
lr_sheduler = program.lr_sheduler
lr_value = lr_sheduler()
lr_var = program.global_block().vars[lr_sheduler._var_name]
......
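The Executor hunks above only swap the isinstance check from ``_LRScheduler`` to ``LRScheduler``; the mechanism is unchanged: ``minimize`` attaches the scheduler to the program, and the Executor feeds the scheduler's current value in as a tensor on each run. A static-graph sketch of that path, adapted from the static-mode docstring examples later in this diff (illustrative, not part of the change):

    import numpy as np
    import paddle

    paddle.enable_static()
    main_prog, start_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, start_prog):
        x = paddle.static.data(name='x', shape=[None, 4, 5])
        y = paddle.static.data(name='y', shape=[None, 4, 5])
        z = paddle.static.nn.fc(x, 100)
        loss = paddle.mean(z)
        scheduler = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
        sgd = paddle.optimizer.SGD(learning_rate=scheduler)
        sgd.minimize(loss)   # attaches the scheduler; the Executor feeds its value

    exe = paddle.static.Executor()
    exe.run(start_prog)
    for epoch in range(20):
        for batch_id in range(2):
            out = exe.run(main_prog,
                          feed={'x': np.random.randn(3, 4, 5).astype('float32'),
                                'y': np.random.randn(3, 4, 5).astype('float32')},
                          fetch_list=loss.name)
        scheduler.step()     # the Executor picks up the new value on the next run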
......@@ -70,15 +70,15 @@ class Optimizer(object):
grad_clip=None,
name=None):
# Because of the circular import, place it in the function body
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
self._parameter_list = list(
parameter_list) if parameter_list is not None else None
self._name = name
if framework.in_dygraph_mode():
if not isinstance(learning_rate,
(float, LearningRateDecay, _LRScheduler)):
(float, LearningRateDecay, LRScheduler)):
raise TypeError(
"learning rate should be float or _LRScheduler, got %s here"
"learning rate should be float or LRScheduler, got %s here"
% type(learning_rate))
if self._parameter_list is None:
raise AttributeError(
......@@ -94,9 +94,9 @@ class Optimizer(object):
break
else:
if not isinstance(learning_rate,
(float, framework.Variable, _LRScheduler)):
(float, framework.Variable, LRScheduler)):
raise TypeError(
"learning rate should be float or _LRScheduler, got %s here"
"learning rate should be float or LRScheduler, got %s here"
% type(learning_rate))
if grad_clip is not None:
......@@ -147,13 +147,13 @@ class Optimizer(object):
state_dict = adam.state_dict()
'''
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
state_dict = {}
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp
# global step if use lr decay
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
return state_dict
if isinstance(self._learning_rate, LearningRateDecay):
......@@ -193,7 +193,7 @@ class Optimizer(object):
state_dict = emb.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......@@ -203,8 +203,8 @@ class Optimizer(object):
para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
'''
from paddle.optimizer.lr_scheduler import _LRScheduler
if isinstance(self._learning_rate, _LRScheduler):
from paddle.optimizer.lr import LRScheduler
if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, LearningRateDecay):
......@@ -269,8 +269,8 @@ class Optimizer(object):
return self._opti_name_list
def _create_global_learning_rate(self):
from paddle.optimizer.lr_scheduler import _LRScheduler
if isinstance(self._learning_rate, _LRScheduler):
from paddle.optimizer.lr import LRScheduler
if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate()
# only create global lr_var once
if not isinstance(lr_var, framework.Variable):
......
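When the learning rate is an ``LRScheduler``, ``state_dict`` gains an ``"LR_Scheduler"`` entry (by default ``last_epoch`` and ``last_lr``) and ``set_state_dict`` restores it, as the hunks above show for the fluid ``Optimizer``. A short sketch of the same round trip through the 2.0 ``paddle.optimizer.Adam``, assuming it follows the identical convention:

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100)
    adam = paddle.optimizer.Adam(learning_rate=scheduler, parameters=linear.parameters())

    state = adam.state_dict()
    print(state["LR_Scheduler"])   # the scheduler's 'last_epoch' and 'last_lr'

    scheduler.step()               # advance the schedule ...
    adam.set_state_dict(state)     # ... then roll it back from the saved state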
......@@ -455,8 +455,8 @@ class TestAdamOpV2(unittest.TestCase):
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
#learning_rate is _LRScheduler
learning_rate = paddle.optimizer.CosineAnnealingLR(
#learning_rate is LRScheduler
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.1, T_max=10)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate,
......
......@@ -43,14 +43,22 @@ class TestDirectory(unittest.TestCase):
'paddle.distributed.prepare_context', 'paddle.DataParallel',
'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static',
'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay',
'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay',
'paddle.ExponentialDecay', 'paddle.InverseTimeDecay',
'paddle.PolynomialDecay', 'paddle.CosineDecay',
'paddle.static.Executor', 'paddle.static.global_scope',
'paddle.static.scope_guard', 'paddle.static.append_backward',
'paddle.static.gradients', 'paddle.static.BuildStrategy',
'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy',
'paddle.jit.save', 'paddle.jit.load',
'paddle.optimizer.lr.LRScheduler', 'paddle.optimizer.lr.NoamDecay',
'paddle.optimizer.lr.PiecewiseDecay',
'paddle.optimizer.lr.NaturalExpDecay',
'paddle.optimizer.lr.ExponentialDecay',
'paddle.optimizer.lr.InverseTimeDecay',
'paddle.optimizer.lr.PolynomialDecay',
'paddle.optimizer.lr.CosineAnnealingDecay',
'paddle.optimizer.lr.MultiStepDecay',
'paddle.optimizer.lr.StepDecay', 'paddle.optimizer.lr.LambdaDecay',
'paddle.optimizer.lr.ReduceOnPlateau',
'paddle.optimizer.lr.LinearWarmup', 'paddle.static.Executor',
'paddle.static.global_scope', 'paddle.static.scope_guard',
'paddle.static.append_backward', 'paddle.static.gradients',
'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram',
'paddle.static.ExecutionStrategy',
'paddle.static.default_main_program',
'paddle.static.default_startup_program', 'paddle.static.Program',
'paddle.static.name_scope', 'paddle.static.program_guard',
......
......@@ -23,7 +23,7 @@ import itertools
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
from paddle.fluid.optimizer import MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable
......@@ -72,15 +72,17 @@ class TestImperativeOptimizerBase(unittest.TestCase):
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
with fluid.dygraph.guard(place):
try:
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
except Exception as e:
assert str(e) == exception_message
try:
paddle.disable_static()
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
except Exception as e:
assert str(e) == exception_message
finally:
paddle.enable_static()
def _check_mlp(self, place=None):
seed = 90
......@@ -90,47 +92,55 @@ class TestImperativeOptimizerBase(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
paddle.disable_static(place)
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
mlp = MLP()
optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
paddle.batch(
self.reader_decorator(paddle.dataset.mnist.train()),
batch_size=batch_size,
drop_last=True),
places=fluid.CPUPlace())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
paddle.batch(
self.reader_decorator(paddle.dataset.mnist.train()),
batch_size=batch_size,
drop_last=True),
places=fluid.CPUPlace())
dy_param_init_value = {}
for batch_id, data in enumerate(batch_py_reader()):
if batch_id >= self.batch_num:
break
dy_param_init_value = {}
for batch_id, data in enumerate(batch_py_reader()):
if batch_id >= self.batch_num:
break
img = data[0]
label = data[1]
label.stop_gradient = True
img = data[0]
label = data[1]
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
label.stop_gradient = True
if batch_id == 0:
for param in mlp.parameters():
dy_param_init_value[param.name] = param.numpy()
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
avg_loss.backward()
optimizer.minimize(avg_loss)
mlp.clear_gradients()
dy_param_value = {}
if batch_id == 0:
for param in mlp.parameters():
dy_param_value[param.name] = param.numpy()
dy_param_init_value[param.name] = param.numpy()
avg_loss.backward()
optimizer.minimize(avg_loss)
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.ReduceOnPlateau):
optimizer._learning_rate.step(avg_loss)
else:
optimizer._learning_rate.step()
mlp.clear_gradients()
dy_param_value = {}
for param in mlp.parameters():
dy_param_value[param.name] = param.numpy()
paddle.enable_static()
with new_program_scope():
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
......@@ -181,6 +191,13 @@ class TestImperativeOptimizerBase(unittest.TestCase):
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.ReduceOnPlateau):
optimizer._learning_rate.step(out[0])
else:
optimizer._learning_rate.step()
static_param_value = {}
static_out = out[0]
......@@ -199,17 +216,19 @@ class TestImperativeOptimizerBase(unittest.TestCase):
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9]
optimizer = SGDOptimizer(
learning_rate=paddle.optimizer.PiecewiseLR(
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list)
parameters=parameter_list)
return optimizer
def get_optimizer(self):
bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR(
boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
return optimizer
def test_sgd(self):
......@@ -218,21 +237,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.5, gamma=0.9),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.5, gamma=0.9))
return optimizer
def test_sgd(self):
......@@ -241,21 +255,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ExponentialDecay(
learning_rate=0.5, gamma=0.9),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ExponentialDecay(
learning_rate=0.5, gamma=0.9))
return optimizer
def test_sgd(self):
......@@ -264,21 +273,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam(
learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
optimizer = paddle.optimizer.Adam(
learning_rate=paddle.optimizer.lr.InverseTimeDecay(
learning_rate=0.5, gamma=0.9),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
optimizer = paddle.optimizer.Adam(
learning_rate=paddle.optimizer.lr.InverseTimeDecay(
learning_rate=0.5, gamma=0.9))
return optimizer
def test_adam(self):
......@@ -287,15 +291,16 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PolynomialDecay(
learning_rate=0.5, decay_steps=5, cycle=self.cycle),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PolynomialDecay(
learning_rate=0.5, decay_steps=5, cycle=self.cycle))
return optimizer
def test_sgd_cycle(self):
......@@ -307,17 +312,18 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
self._check_mlp()
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.5, T_max=5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.5, T_max=5))
return optimizer
def test_sgd(self):
......@@ -326,15 +332,110 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LambdaDecay(
learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LambdaDecay(
learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5,
warmup_steps=20,
start_lr=0,
end_lr=0.5,
verbose=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.StepDecay(
learning_rate=0.5, step_size=5, gamma=0.8),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.StepDecay(
learning_rate=0.5, step_size=5, gamma=0.8))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=0.5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=0.5))
return optimizer
def test_sgd(self):
......@@ -381,7 +482,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
scheduler = paddle.optimizer.PiecewiseLR(bd, value)
scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
adam = paddle.optimizer.Adam(
scheduler, parameters=linear.parameters())
......@@ -396,7 +497,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
scheduler.step()
def test_lr_decay_natural_exp(self):
def test_lr_scheduler_natural_exp(self):
with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
......@@ -407,8 +508,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
loss = fluid.layers.reduce_mean(b)
base_lr = 1.0
scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5)
print("scheduler.last_lr", scheduler.last_lr)
scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5)
adam = paddle.optimizer.Adam(
scheduler, parameters=linear.parameters())
......@@ -453,7 +553,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
with self.assertRaises(RuntimeError):
adam = paddle.optimizer.Adam(
paddle.optimizer.NaturalExpLR(
paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.1, gamma=0.5),
parameters=linear.parameters())
adam.set_lr(0.01)
......@@ -695,10 +795,10 @@ class TestImperativeOptimizerList(unittest.TestCase):
linear_1 = Linear(10, 10)
linear_2 = Linear(10, 10)
sgd = SGDOptimizer(
1.0,
parameter_list=itertools.chain(linear_1.parameters(),
linear_2.parameters()))
sgd = paddle.optimizer.SGD(1.0,
parameters=itertools.chain(
linear_1.parameters(),
linear_2.parameters()))
in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
in_data = fluid.dygraph.to_variable(in_np)
......
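The test refactor above also exercises the 2.0 stepping convention: after the optimizer update, most schedulers are stepped without arguments, while ``ReduceOnPlateau`` is stepped with the metric it monitors. A condensed dygraph sketch of that pattern, mirroring the ``isinstance`` branching in the test (illustrative only):

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

    for epoch in range(5):
        x = paddle.uniform([10, 10])
        loss = paddle.mean(linear(x))
        loss.backward()
        sgd.step()
        sgd.clear_gradients()
        if isinstance(scheduler, paddle.optimizer.lr.ReduceOnPlateau):
            scheduler.step(loss)   # needs the monitored metric
        else:
            scheduler.step()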
......@@ -239,7 +239,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -328,7 +328,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -436,7 +436,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -544,7 +544,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -829,7 +829,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler,
......
......@@ -56,22 +56,22 @@ def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
return var_list[1]
class TestReduceLROnPlateauDecay(object):
class TestReduceOnPlateauDecay(object):
def test_ReduceLR(self):
# the decay rate must be less than 1.0
with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0)
paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=2.0)
# the mode must be "min" or "max"
with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test")
paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test")
# the threshold_mode must be "rel" or "abs"
with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(
paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=1.0, threshold_mode="test")
with self.assertRaises(TypeError):
paddle.optimizer.ReduceLROnPlateau(learning_rate="test")
paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test")
with self.assertRaises(TypeError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test")
paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5).step("test")
places = [paddle.CPUPlace()]
if core.is_compiled_with_cuda():
......@@ -114,7 +114,7 @@ class TestReduceLROnPlateauDecay(object):
[1], 1, 'float32', persistable=True)
paddle.increment(x)
loss = paddle.sin(x)
scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam = paddle.optimizer.Adam(learning_rate=scheduler)
adam.minimize(loss)
lr_var = adam._global_learning_rate()
......@@ -158,7 +158,7 @@ class TestReduceLROnPlateauDecay(object):
var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam = paddle.optimizer.Adam(
learning_rate=scheduler, parameters=linear.parameters())
......@@ -180,7 +180,7 @@ class TestReduceLROnPlateauDecay(object):
loss, var_list)
self.assertEqual(current_lr, expected_lr)
state_dict = adam.state_dict()
scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs)
scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam1 = paddle.optimizer.Adam(
learning_rate=scheduler1, parameters=linear.parameters())
adam1.set_state_dict(state_dict)
......@@ -420,7 +420,7 @@ class TestLRScheduler(unittest.TestCase):
adam.clear_grad()
current_lr = adam.get_lr()
expected_lr = python_func(epoch, **kwarg)
if paddle_api.__name__ != "CosineAnnealingLR":
if paddle_api.__name__ != "CosineAnnealingDecay":
self.assertEqual(current_lr, expected_lr)
scheduler.step()
else:
......@@ -429,74 +429,75 @@ class TestLRScheduler(unittest.TestCase):
def test_scheduler(self):
with self.assertRaises(NotImplementedError):
paddle.optimizer.lr_scheduler._LRScheduler().step()
paddle.optimizer.lr.LRScheduler().step()
with self.assertRaises(TypeError):
paddle.optimizer.MultiStepLR(
paddle.optimizer.lr.MultiStepDecay(
learning_rate="test", milestones=[1, 2, 3])
with self.assertRaises(TypeError):
paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test')
paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones='test')
with self.assertRaises(ValueError):
paddle.optimizer.MultiStepLR(
paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[3, 2, 1])
with self.assertRaises(ValueError):
paddle.optimizer.MultiStepLR(
paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
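The parametrized cases in ``func_api_kwargs`` below pair each scheduler with a plain-Python reference function that recomputes the same formula and compares it against the scheduler's value epoch by epoch. A condensed sketch of that pattern for ``NaturalExpDecay``; the helper name and signature here are illustrative, not the test's own:

    import math
    import paddle

    def natural_exp_lr(epoch, learning_rate, gamma):
        # reference: base_lr * e^(-gamma * epoch), as in NaturalExpDecay.get_lr()
        return learning_rate * math.exp(-gamma * epoch)

    scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1)
    for epoch in range(10):
        assert abs(scheduler() - natural_exp_lr(epoch, 0.5, 0.1)) < 1e-10
        scheduler.step()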
func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, {
func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, {
"d_model": 0.01,
"warmup_steps": 100,
"verbose": False
}), (piecewise_lr, paddle.optimizer.PiecewiseLR, {
}), (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, {
"boundaries": [3, 6, 9, 15, 20],
"values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
"verbose": False
}), (natural_exp_lr, paddle.optimizer.NaturalExpLR, {
}), (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, {
"learning_rate": 0.5,
"gamma": 0.1,
"verbose": True
}), (inverse_time_lr, paddle.optimizer.InverseTimeLR, {
}), (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, {
"learning_rate": 0.5,
"gamma": 0.1,
"verbose": False
}), (polynomial_lr, paddle.optimizer.PolynomialLR, {
}), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
"learning_rate": 0.5,
"decay_steps": 20,
"end_lr": 0,
"power": 1.0,
"cycle": False,
"verbose": True
}), (polynomial_lr, paddle.optimizer.PolynomialLR, {
}), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
"learning_rate": 0.5,
"decay_steps": 20,
"end_lr": 0,
"power": 1.0,
"cycle": True,
"verbose": False
}), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, {
}), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, {
'learning_rate': 0.5,
'warmup_steps': 20,
'start_lr': 0,
'end_lr': 0.5,
"verbose": True
}), (exponential_lr, paddle.optimizer.ExponentialLR, {
}), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, {
"learning_rate": 0.5,
"gamma": 0.9,
"verbose": False
}), (multi_step_lr, paddle.optimizer.MultiStepLR, {
}), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, {
"learning_rate": 0.5,
"milestones": [3, 6, 9, 15, 20],
"gamma": 0.8,
"verbose": True
}), (step_lr, paddle.optimizer.StepLR, {
}), (step_lr, paddle.optimizer.lr.StepDecay, {
"learning_rate": 0.5,
"step_size": 2,
"gamma": 0.8,
"verbose": False
}), (lambda_lr, paddle.optimizer.LambdaLR, {
}), (lambda_lr, paddle.optimizer.lr.LambdaDecay, {
"learning_rate": 0.5,
"lr_lambda": lambda x: 0.95**x,
"verbose": True
}), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, {
}), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, {
"learning_rate": 0.5,
"T_max": 10,
"verbose": False
......
......@@ -24,11 +24,6 @@ __all__ += [
'DataParallel'
]
__all__ += [
'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
]
from . import random
from .random import manual_seed
from .framework import get_default_dtype
......@@ -51,11 +46,3 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS
from .io import save
from .io import load
from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS
......@@ -228,7 +228,7 @@ def save(obj, path):
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......@@ -320,7 +320,7 @@ def load(path, **configs):
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......
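The updated ``paddle.save`` / ``paddle.load`` docstrings above pair the layer's state dict with the optimizer's, so the scheduler state travels with the checkpoint. A short sketch of the full round trip; the file names are illustrative, and the restore step assumes the 2.0 ``set_state_dict`` methods used elsewhere in this diff:

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
    adam = paddle.optimizer.Adam(learning_rate=scheduler, parameters=linear.parameters())

    paddle.save(linear.state_dict(), "linear.pdparams")
    paddle.save(adam.state_dict(), "adam.pdopt")      # includes "LR_Scheduler"

    linear.set_state_dict(paddle.load("linear.pdparams"))
    adam.set_state_dict(paddle.load("adam.pdopt"))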
......@@ -121,13 +121,6 @@ from .layer.conv import ConvTranspose3d #DEFINE_ALIAS
# from .layer.conv import TreeConv #DEFINE_ALIAS
# from .layer.conv import Conv1D #DEFINE_ALIAS
from .layer.extension import RowConv #DEFINE_ALIAS
# from .layer.learning_rate import CosineDecay #DEFINE_ALIAS
# from .layer.learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .layer.learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .layer.learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .layer.learning_rate import NoamDecay #DEFINE_ALIAS
# from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS
from .layer.common import Linear
# from .layer.loss import NCELoss #DEFINE_ALIAS
from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS
......
......@@ -95,14 +95,6 @@ from .extension import target_assign #DEFINE_ALIAS
from .extension import temporal_shift #DEFINE_ALIAS
from .extension import warpctc #DEFINE_ALIAS
from .extension import diag_embed #DEFINE_ALIAS
from .learning_rate import cosine_decay #DEFINE_ALIAS
from .learning_rate import exponential_decay #DEFINE_ALIAS
from .learning_rate import inverse_time_decay #DEFINE_ALIAS
from .learning_rate import natural_exp_decay #DEFINE_ALIAS
from .learning_rate import noam_decay #DEFINE_ALIAS
from .learning_rate import piecewise_decay #DEFINE_ALIAS
from .learning_rate import polynomial_decay #DEFINE_ALIAS
from .learning_rate import linear_lr_warmup #DEFINE_ALIAS
# from .lod import sequence_concat #DEFINE_ALIAS
# from .lod import sequence_conv #DEFINE_ALIAS
# from .lod import sequence_enumerate #DEFINE_ALIAS
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
from ...fluid.layers import cosine_decay #DEFINE_ALIAS
from ...fluid.layers import exponential_decay #DEFINE_ALIAS
from ...fluid.layers import inverse_time_decay #DEFINE_ALIAS
from ...fluid.layers import natural_exp_decay #DEFINE_ALIAS
from ...fluid.layers import noam_decay #DEFINE_ALIAS
from ...fluid.layers import piecewise_decay #DEFINE_ALIAS
from ...fluid.layers import polynomial_decay #DEFINE_ALIAS
from ...fluid.layers import linear_lr_warmup #DEFINE_ALIAS
__all__ = [
'cosine_decay', 'exponential_decay', 'inverse_time_decay',
'natural_exp_decay', 'noam_decay', 'piecewise_decay', 'polynomial_decay',
'linear_lr_warmup'
]
......@@ -86,13 +86,6 @@ from .conv import ConvTranspose3d #DEFINE_ALIAS
# from .conv import TreeConv #DEFINE_ALIAS
# from .conv import Conv1D #DEFINE_ALIAS
from .extension import RowConv #DEFINE_ALIAS
# from .learning_rate import CosineDecay #DEFINE_ALIAS
# from .learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .learning_rate import NoamDecay #DEFINE_ALIAS
# from .learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .learning_rate import PolynomialDecay #DEFINE_ALIAS
# from .loss import NCELoss #DEFINE_ALIAS
from .loss import BCEWithLogitsLoss #DEFINE_ALIAS
from .loss import CrossEntropyLoss #DEFINE_ALIAS
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
__all__ = [
# 'CosineDecay',
# 'ExponentialDecay',
# 'InverseTimeDecay',
# 'NaturalExpDecay',
# 'NoamDecay',
# 'PiecewiseDecay',
# 'PolynomialDecay'
]
......@@ -16,10 +16,7 @@ __all__ = [
'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd',
'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer',
'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR',
'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
'ReduceLROnPlateau', 'CosineAnnealingLR'
'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer'
]
......@@ -36,6 +33,4 @@ from .adadelta import Adadelta
from .sgd import SGD
from .momentum import Momentum
from . import lr_scheduler
from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR
from . import lr
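With the aliases dropped from ``__all__``, the schedulers are now reached through the ``paddle.optimizer.lr`` module rather than re-exported from ``paddle.optimizer``. A short sketch of the surviving import path:

    import paddle
    from paddle.optimizer.lr import LRScheduler, StepDecay

    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8)
    assert isinstance(scheduler, LRScheduler)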
......@@ -48,8 +48,8 @@ class Adam(Optimizer):
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9.
......
......@@ -47,8 +47,8 @@ class Adamax(Optimizer):
it is added here for numerical stability to prevent the division by 0 error.
Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
......
......@@ -42,8 +42,8 @@ class AdamW(Adam):
Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
......
......@@ -18,18 +18,62 @@ import warnings
from paddle import Tensor
__all__ = [
'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
'ReduceLROnPlateau', 'CosineAnnealingLR'
'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay',
'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay',
'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau',
'CosineAnnealingDecay'
]
class _LRScheduler(object):
"""LRScheduler Base class.
class LRScheduler(object):
"""
LRScheduler base class. It defines the common interface of a learning rate scheduler.
Users can import it via ``from paddle.optimizer.lr import LRScheduler`` ,
then subclass it and provide a custom implementation of ``get_lr()`` .
Otherwise, a ``NotImplementedError`` exception will be raised.
Args:
learning_rate (float): The initial learning rate. It is a python float number.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
instance to schedule learning rate.
Examples:
Here is an example of a simple ``StepDecay`` implementation.
.. code-block:: python
import paddle
from paddle.optimizer.lr import LRScheduler
class StepDecay(LRScheduler):
def __init__(self,
learning_rate,
step_size,
gamma=0.1,
last_epoch=-1,
verbose=False):
if not isinstance(step_size, int):
raise TypeError(
"The type of 'step_size' must be 'int', but received %s." %
type(step_size))
if gamma >= 1.0:
raise ValueError('gamma should be < 1.0.')
self.step_size = step_size
self.gamma = gamma
super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self):
i = self.last_epoch // self.step_size
return self.base_lr * (self.gamma**i)
Define the common interface of an LRScheduler.
User can 'form paddle.optimizer.lr_scheduler import _LRScheduler'
And inherit from it to have a custom implementation of get_lr().
"""
def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
......@@ -47,23 +91,22 @@ class _LRScheduler(object):
def __call__(self):
"""
Return last computed learning rate on current epoch.
Return the latest computed learning rate on the current epoch.
"""
return self.last_lr
def step(self, epoch=None):
"""
'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'.
The new learning rate will take effect on next epoch.
``step`` should be called after ``optimizer.step`` . It will update the learning rate of the optimizer according to the current ``epoch`` .
The new learning rate will take effect on the next ``optimizer.step`` .
Args:
epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
Returns:
None
Examples:
Please refer to the example of the current ``LRScheduler`` subclass.
"""
if epoch is None:
self.last_epoch += 1
......@@ -81,11 +124,12 @@ class _LRScheduler(object):
def state_dict(self):
"""
Returns the state of the scheduler as a :class:`dict`.
It is a subset of self.__dict__ .
It is a subset of ``self.__dict__`` .
"""
self._state_keys()
self.state_keys()
state_dict = {}
for key in self.keys:
if key not in self.__dict__:
......@@ -101,19 +145,26 @@ class _LRScheduler(object):
return state_dict
# For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default.
# For subclasses that overload LRScheduler, "last_epoch" and "last_lr" will be saved by default.
# (Note): you can change it for your subclass.
def _state_keys(self):
def state_keys(self):
"""
set the keys in self.__dict__ that are needed to be saved.
For subclasses that overload ``LRScheduler`` (the base class), ``last_epoch`` and ``last_lr`` are saved by default via ``self.keys = ['last_epoch', 'last_lr']`` .
``last_epoch`` is the current epoch number, and ``last_lr`` is the current learning rate.
If you want to change the default behavior, you should provide a custom implementation of ``state_keys()`` to redefine ``self.keys`` .
"""
self.keys = ['last_epoch', 'last_lr']
def set_state_dict(self, state_dict):
"""
Loads the scheduler's state.
"""
self._state_keys()
self.state_keys()
for key in self.keys:
if key in state_dict:
self.__dict__[key] = state_dict[key]
......@@ -130,14 +181,20 @@ class _LRScheduler(object):
set_dict = set_state_dict
def get_lr(self):
"""
Subclasses that overload ``LRScheduler`` (the base class) should provide a custom implementation of ``get_lr()`` .
Otherwise, a ``NotImplementedError`` exception will be raised.
"""
# calculate by python float
raise NotImplementedError
class NoamLR(_LRScheduler):
class NoamDecay(LRScheduler):
"""
Applies Noam Lear to the initial learning rate.
Applies Noam Decay to the initial learning rate.
The algorithm can be described as follows.
......@@ -156,7 +213,7 @@ class NoamLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``NoamLR`` instance to schedule learning rate.
``NoamDecay`` instance to schedule learning rate.
Examples:
.. code-block:: python
......@@ -164,23 +221,21 @@ class NoamLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -189,7 +244,7 @@ class NoamLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True)
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -216,7 +271,7 @@ class NoamLR(_LRScheduler):
verbose=False):
self.d_model = d_model
self.warmup_steps = warmup_steps
super(NoamLR, self).__init__(learning_rate, last_epoch, verbose)
super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self):
if self.last_epoch == 0:
......@@ -227,7 +282,7 @@ class NoamLR(_LRScheduler):
return self.base_lr * (self.d_model**-0.5) * min(a, b)
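For reference, a pure-Python mirror of ``NoamDecay.get_lr()`` above for ``epoch >= 1`` (assuming the default ``learning_rate`` of 1.0; the epoch-0 branch is collapsed in this hunk and omitted here):

    def noam_lr(epoch, d_model=0.01, warmup_steps=100, base_lr=1.0):
        a = epoch ** -0.5
        b = warmup_steps ** -1.5 * epoch
        return base_lr * d_model ** -0.5 * min(a, b)

    # rises during warm-up, then decays as epoch ** -0.5
    print(noam_lr(10), noam_lr(100), noam_lr(1000))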
class PiecewiseLR(_LRScheduler):
class PiecewiseDecay(LRScheduler):
"""
Piecewise learning rate scheduler.
......@@ -253,7 +308,7 @@ class PiecewiseLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``PiecewiseLR`` instance to schedule learning rate.
``PiecewiseDecay`` instance to schedule learning rate.
Examples:
......@@ -262,23 +317,21 @@ class PiecewiseLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -287,7 +340,7 @@ class PiecewiseLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -308,7 +361,7 @@ class PiecewiseLR(_LRScheduler):
def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
self.boundaries = boundaries
self.values = values
super(PiecewiseLR, self).__init__(
super(PiecewiseDecay, self).__init__(
last_epoch=last_epoch, verbose=verbose)
def get_lr(self):
......@@ -319,7 +372,7 @@ class PiecewiseLR(_LRScheduler):
return self.values[len(self.values) - 1]
class NaturalExpLR(_LRScheduler):
class NaturalExpDecay(LRScheduler):
"""
Applies natural exponential decay to the initial learning rate.
......@@ -328,7 +381,7 @@ class NaturalExpLR(_LRScheduler):
.. math::
new\_learning\_rate = learning\_rate * e^{- gama * epoch}
new\_learning\_rate = learning\_rate * e^{- gamma * epoch}
Args:
learning_rate (float): The initial learning rate. It is a python float number.
......@@ -337,7 +390,7 @@ class NaturalExpLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``NaturalExpLR`` instance to schedule learning rate.
``NaturalExpDecay`` instance to schedule learning rate.
Examples:
......@@ -346,23 +399,21 @@ class NaturalExpLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -371,7 +422,7 @@ class NaturalExpLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True)
scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -391,13 +442,14 @@ class NaturalExpLR(_LRScheduler):
def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
self.gamma = gamma
super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose)
super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self):
return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)
class InverseTimeLR(_LRScheduler):
class InverseTimeDecay(LRScheduler):
"""
Applies inverse time decay to the initial learning rate.
......@@ -416,7 +468,7 @@ class InverseTimeLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``InverseTimeLR`` instance to schedule learning rate.
``InverseTimeDecay`` instance to schedule learning rate.
Examples:
......@@ -425,23 +477,21 @@ class InverseTimeLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -450,7 +500,7 @@ class InverseTimeLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True)
scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -471,13 +521,14 @@ class InverseTimeLR(_LRScheduler):
def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
self.gamma = gamma
super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose)
super(InverseTimeDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self):
return self.base_lr / (1 + self.gamma * self.last_epoch)
class PolynomialLR(_LRScheduler):
class PolynomialDecay(LRScheduler):
"""
Applies polynomial decay to the initial learning rate.
......@@ -512,7 +563,7 @@ class PolynomialLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``PolynomialLR`` instance to schedule learning rate.
``PolynomialDecay`` instance to schedule learning rate.
Examples:
......@@ -521,23 +572,21 @@ class PolynomialLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -546,7 +595,7 @@ class PolynomialLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True)
scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -576,7 +625,8 @@ class PolynomialLR(_LRScheduler):
self.end_lr = end_lr
self.power = power
self.cycle = cycle
super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose)
super(PolynomialDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self):
tmp_epoch_num = self.last_epoch
......@@ -596,7 +646,7 @@ class PolynomialLR(_LRScheduler):
)**self.power) + self.end_lr
class LinearLrWarmup(_LRScheduler):
class LinearWarmup(LRScheduler):
"""
Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
......@@ -604,22 +654,22 @@ class LinearLrWarmup(_LRScheduler):
When epoch < warmup_steps, learning rate is updated as:
.. code-block:: text
.. math::
lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps)
lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps}
where start_lr is the initial learning rate, and end_lr is the final learning rate;
When epoch >= warmup_steps, learning rate is updated as:
.. code-block:: text
.. math::
lr = learning_rate
where lr is float or any subclass of ``_LRScheduler`` .
where ``learning_rate`` is float or any subclass of ``LRScheduler`` .
Args:
learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` .
learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
warmup_steps (int): total steps of warm up.
start_lr (float): Initial learning rate of warm up.
end_lr (float): Final learning rate of warm up.
......@@ -627,7 +677,7 @@ class LinearLrWarmup(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``LinearLrWarmup`` instance to schedule learning rate.
``LinearWarmup`` instance to schedule learning rate.
Examples:
......@@ -636,24 +686,22 @@ class LinearLrWarmup(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.LinearLrWarmup(
scheduler = paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -662,7 +710,7 @@ class LinearLrWarmup(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup(
scheduler = paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -678,7 +726,7 @@ class LinearLrWarmup(_LRScheduler):
'y': np.random.randn(3, 4, 5).astype('float32')
},
fetch_list=loss.name)
scheduler.step()
scheduler.step()
"""
def __init__(self,
......@@ -689,10 +737,10 @@ class LinearLrWarmup(_LRScheduler):
last_epoch=-1,
verbose=False):
type_check = isinstance(learning_rate, float) or isinstance(
learning_rate, int) or isinstance(learning_rate, _LRScheduler)
learning_rate, int) or isinstance(learning_rate, LRScheduler)
if not type_check:
raise TypeError(
"the type of learning_rate should be [int, float or _LRScheduler], the current type is {}".
"the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
format(learning_rate))
self.learning_rate = learning_rate
self.warmup_steps = warmup_steps
......@@ -700,24 +748,24 @@ class LinearLrWarmup(_LRScheduler):
self.end_lr = end_lr
assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
end_lr, start_lr)
super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose)
super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)
def get_lr(self):
if self.last_epoch < self.warmup_steps:
return (self.end_lr - self.start_lr) * float(
self.last_epoch) / float(self.warmup_steps) + self.start_lr
else:
if isinstance(self.learning_rate, _LRScheduler):
if isinstance(self.learning_rate, LRScheduler):
self.learning_rate.step()
return self.learning_rate()
return self.learning_rate
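A hedged standalone sketch of the warm-up rule above, reusing the illustrative values from the docstring example (start_lr=0, end_lr=0.5, warmup_steps=20) and a plain float after warm-up:
start_lr, end_lr, warmup_steps, after_warmup_lr = 0.0, 0.5, 20, 0.5  # illustrative values only
def warmup_lr(epoch):
    if epoch < warmup_steps:
        return (end_lr - start_lr) * float(epoch) / float(warmup_steps) + start_lr
    return after_warmup_lr
# warmup_lr(0) -> 0.0, warmup_lr(10) -> 0.25, warmup_lr(19) -> 0.475, warmup_lr(20) -> 0.5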
class ExponentialLR(_LRScheduler):
class ExponentialDecay(LRScheduler):
"""
Update learning rate by 'gamma' each epoch.
Update learning rate by `gamma` each epoch.
The algorithm can be described as follows.
......@@ -733,7 +781,7 @@ class ExponentialLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``ExponentialLR`` instance to schedule learning rate.
``ExponentialDecay`` instance to schedule learning rate.
Examples:
......@@ -742,23 +790,21 @@ class ExponentialLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -767,7 +813,7 @@ class ExponentialLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True)
scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -787,15 +833,16 @@ class ExponentialLR(_LRScheduler):
def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
self.gamma = gamma
super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose)
super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self):
return self.base_lr * (self.gamma**self.last_epoch)
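For reference, a minimal sketch of the exponential rule base_lr * gamma**epoch, with the illustrative values used in the example above (learning_rate=0.5, gamma=0.9):
base_lr, gamma = 0.5, 0.9  # illustrative values only
lrs = [base_lr * gamma ** epoch for epoch in range(4)]
# [0.5, 0.45, 0.405, 0.3645] (up to float rounding)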
class MultiStepLR(_LRScheduler):
class MultiStepDecay(LRScheduler):
"""
Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones.
Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.
The algorithm can be described as the code below.
......@@ -821,7 +868,7 @@ class MultiStepLR(_LRScheduler):
Returns:
``MultiStepLR`` instance to schedule learning rate.
``MultiStepDecay`` instance to schedule learning rate.
Examples:
......@@ -830,23 +877,21 @@ class MultiStepLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -855,7 +900,7 @@ class MultiStepLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -894,7 +939,7 @@ class MultiStepLR(_LRScheduler):
self.milestones = milestones
self.gamma = gamma
super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose)
super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self):
for i in range(len(self.milestones)):
......@@ -903,7 +948,7 @@ class MultiStepLR(_LRScheduler):
return self.base_lr * (self.gamma**len(self.milestones))
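A small standalone sketch of the milestone rule above, with the illustrative values milestones=[2, 4, 6] and gamma=0.8 from the example:
base_lr, gamma, milestones = 0.5, 0.8, [2, 4, 6]  # illustrative values only
def multistep_lr(epoch):
    for i, milestone in enumerate(milestones):
        if epoch < milestone:
            return base_lr * gamma ** i
    return base_lr * gamma ** len(milestones)
# multistep_lr(1) -> 0.5, multistep_lr(3) -> 0.4, multistep_lr(5) -> 0.32, multistep_lr(7) -> 0.256 (up to float rounding)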
class StepLR(_LRScheduler):
class StepDecay(LRScheduler):
"""
Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` epochs.
......@@ -929,7 +974,7 @@ class StepLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``StepLR`` instance to schedule learning rate.
``StepDecay`` instance to schedule learning rate.
Examples:
......@@ -939,23 +984,21 @@ class StepLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -964,7 +1007,7 @@ class StepLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -997,14 +1040,14 @@ class StepLR(_LRScheduler):
self.step_size = step_size
self.gamma = gamma
super(StepLR, self).__init__(learning_rate, last_epoch, verbose)
super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self):
i = self.last_epoch // self.step_size
return self.base_lr * (self.gamma**i)
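The step rule above is simply base_lr * gamma**(epoch // step_size); a quick sketch with the illustrative values step_size=5 and gamma=0.8:
base_lr, step_size, gamma = 0.5, 5, 0.8  # illustrative values only
lrs = [base_lr * gamma ** (epoch // step_size) for epoch in (0, 4, 5, 10)]
# [0.5, 0.5, 0.4, 0.32] (up to float rounding)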
class LambdaLR(_LRScheduler):
class LambdaDecay(LRScheduler):
"""
Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` .
......@@ -1015,9 +1058,9 @@ class LambdaLR(_LRScheduler):
learning_rate = 0.5 # init learning_rate
lr_lambda = lambda epoch: 0.95 ** epoch
learning_rate = 0.5 # epoch 0
learning_rate = 0.475 # epoch 1
learning_rate = 0.45125 # epoch 2
learning_rate = 0.5 # epoch 0, 0.5*0.95**0
learning_rate = 0.475 # epoch 1, 0.5*0.95**1
learning_rate = 0.45125 # epoch 2, 0.5*0.95**2
Args:
learning_rate (float): The initial learning rate. It is a python float number.
......@@ -1026,7 +1069,7 @@ class LambdaLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``LambdaLR`` instance to schedule learning rate.
``LambdaDecay`` instance to schedule learning rate.
Examples:
......@@ -1035,23 +1078,21 @@ class LambdaLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -1060,7 +1101,7 @@ class LambdaLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -1082,17 +1123,17 @@ class LambdaLR(_LRScheduler):
def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
if not callable(lr_lambda):
raise TypeError(
"The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s."
"The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
% type(lr_lambda))
self.lr_lambda = lr_lambda
super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose)
super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self):
return self.base_lr * self.lr_lambda(self.last_epoch)
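A minimal sketch of the lambda rule base_lr * lr_lambda(epoch), reproducing the values listed in the docstring above:
base_lr = 0.5  # illustrative value only
lr_lambda = lambda epoch: 0.95 ** epoch  # same lambda as the docstring example
lrs = [base_lr * lr_lambda(epoch) for epoch in range(3)]
# [0.5, 0.475, 0.45125] (up to float rounding)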
class ReduceLROnPlateau(_LRScheduler):
class ReduceOnPlateau(LRScheduler):
"""
Reduce the learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
by a factor of 2 to 10 once model performance stops improving.
......@@ -1126,7 +1167,7 @@ class ReduceLROnPlateau(_LRScheduler):
Returns:
``ReduceLROnPlateau`` instance to schedule learning rate.
``ReduceOnPlateau`` instance to schedule learning rate.
Examples:
......@@ -1135,23 +1176,21 @@ class ReduceLROnPlateau(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step(loss)
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -1160,7 +1199,7 @@ class ReduceLROnPlateau(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -1207,7 +1246,7 @@ class ReduceLROnPlateau(_LRScheduler):
self.threshold_mode = threshold_mode
if not isinstance(learning_rate, (float, int)):
raise TypeError(
"The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s."
"The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
% type(learning_rate))
self.verbose = verbose
......@@ -1230,7 +1269,7 @@ class ReduceLROnPlateau(_LRScheduler):
self._var_name = None
# "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
def _state_keys(self):
def state_keys(self):
self.keys = [
'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
'last_lr'
......@@ -1238,7 +1277,7 @@ class ReduceLROnPlateau(_LRScheduler):
def step(self, metrics, epoch=None):
"""
step should be called after 'minimize' . It will update the learning rate in optimizer according to ``metrics`` .
step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` .
The new learning rate will take effect on the next epoch.
Args:
......@@ -1251,14 +1290,14 @@ class ReduceLROnPlateau(_LRScheduler):
None
Examples:
Please refer to the example of current _LRScheduler.
Please refer to the example of current LRScheduler.
"""
if epoch is None:
self.last_epoch = self.last_epoch + 1
else:
self.last_epoch = epoch
# loss must be 1-D Tensor with shape [1]
# loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
if isinstance(metrics, (Tensor, numpy.ndarray)):
assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
"should be (1L,), but the current metrics.shape is {}. Maybe that " \
......@@ -1290,7 +1329,6 @@ class ReduceLROnPlateau(_LRScheduler):
self.last_lr))
def _is_better(self, current, best):
print("mode", self.mode, 'threshold_mode', self.threshold_mode)
if self.mode == 'min' and self.threshold_mode == 'rel':
return current < best - best * self.threshold
......@@ -1304,31 +1342,23 @@ class ReduceLROnPlateau(_LRScheduler):
return current > best + self.threshold
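As a hedged sketch, the 'min'/'rel' branch shown first accepts a metric as an improvement only when it drops below the best value by more than a relative threshold; the other mode/threshold_mode combinations follow the remaining branches analogously (the function name and threshold value below are illustrative):
def is_better_min_rel(current, best, threshold=1e-4):
    # improvement only if current < best - best * threshold
    return current < best - best * threshold
# is_better_min_rel(0.89, 0.9) -> True, is_better_min_rel(0.89999, 0.9) -> False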
class CosineAnnealingLR(_LRScheduler):
class CosineAnnealingDecay(LRScheduler):
"""
Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
SGDR.
The algorithm can be described as follows.
.. math::
    \begin{aligned}
        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
        & T_{cur} \neq (2k+1)T_{max}; \\
        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
        & T_{cur} = (2k+1)T_{max}.
    \end{aligned}
......@@ -1343,7 +1373,7 @@ class CosineAnnealingLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``CosineAnnealingLR`` instance to schedule learning rate.
``CosineAnnealingDecay`` instance to schedule learning rate.
Examples:
......@@ -1352,23 +1382,21 @@ class CosineAnnealingLR(_LRScheduler):
import paddle
import numpy as np
# train on default dygraph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
# train on default dynamic graph mode
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20):
for batch_id in range(2):
x = paddle.to_tensor(x)
x = paddle.uniform([10, 10])
out = linear(x)
loss = paddle.reduce_mean(out)
loss.backward()
sgd.minimize(loss)
linear.clear_gradients()
sgd.step()
sgd.clear_gradients()
scheduler.step()
# train on static mode
# train on static graph mode
paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
......@@ -1377,7 +1405,7 @@ class CosineAnnealingLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True)
scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)
......@@ -1403,16 +1431,16 @@ class CosineAnnealingLR(_LRScheduler):
verbose=False):
if not isinstance(T_max, int):
raise TypeError(
"The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s."
"The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
% type(T_max))
if not isinstance(eta_min, (float, int)):
raise TypeError(
"The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s."
"The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
% type(eta_min))
self.T_max = T_max
self.eta_min = float(eta_min)
super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch,
verbose)
super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self):
if self.last_epoch == 0:
......
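Although the recursive update in get_lr is cut off here, it is intended to follow the standard closed-form cosine annealing schedule eta_min + (eta_max - eta_min) * (1 + cos(pi * epoch / T_max)) / 2; a standalone sketch with the illustrative values learning_rate=0.5, T_max=10, eta_min=0:
import math
eta_max, eta_min, T_max = 0.5, 0.0, 10  # illustrative values only
lrs = [eta_min + (eta_max - eta_min) * (1 + math.cos(math.pi * epoch / T_max)) / 2
       for epoch in (0, 5, 10)]
# [0.5, 0.25, 0.0] (up to float rounding)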
......@@ -41,7 +41,7 @@ from paddle.fluid.layers import tensor
from functools import reduce
from ..fluid.wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
from .lr_scheduler import _LRScheduler
from .lr import LRScheduler
__all__ = ['Optimizer']
......@@ -54,8 +54,8 @@ class Optimizer(object):
but need to use one of its implementations.
Args:
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``_LRScheduler`` .
learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``LRScheduler`` .
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
......@@ -82,12 +82,8 @@ class Optimizer(object):
#Take the subclass adam as an example
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear(inp)
loss = paddle.mean(out)
adam = paddle.optimizer.Adam(learning_rate=0.1,
......@@ -121,9 +117,9 @@ class Optimizer(object):
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__())
break
if not isinstance(learning_rate, (float, _LRScheduler)):
if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError(
"learning rate should be float or _LRScheduler, got %s here" %
"learning rate should be float or LRScheduler, got %s here" %
type(learning_rate))
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
......@@ -156,7 +152,7 @@ class Optimizer(object):
@framework.dygraph_only
def state_dict(self):
'''
Get state dict information from optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, this includes beta1, beta2, momentum, etc. If an _LRScheduler has been used, global_step will be included in the state dict.
Get state dict information from optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, this includes beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be included in the state dict.
If the optimizer has never been called (i.e. ``minimize`` has not been run), the state_dict is empty.
Args:
......@@ -169,7 +165,6 @@ class Optimizer(object):
.. code-block:: python
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
......@@ -181,14 +176,14 @@ class Optimizer(object):
for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp
# global step if use lr decay
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
return state_dict
@framework.dygraph_only
def set_state_dict(self, state_dict):
'''
Load optimizer state dict. For the Adam optimizer, this includes beta1, beta2, momentum, etc. If an _LRScheduler has been used, global_step will be changed.
Load optimizer state dict. For the Adam optimizer, this includes beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be changed.
Args:
state_dict(dict) : Dict contains all the Tensor needed by optimizer
......@@ -199,26 +194,28 @@ class Optimizer(object):
.. code-block:: python
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000),
parameters=emb.parameters())
state_dict = adam.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
parameters=emb.parameters())
opt_state_dict = adam.state_dict()
paddle.save(opt_state_dict, "adam.pdopt")
opti_state_dict = paddle.load("adam.pdopt")
adam.set_state_dict(opti_state_dict)
'''
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
self._accumulators_holder = state_dict
......@@ -256,7 +253,7 @@ class Optimizer(object):
return self._opti_name_list
def _create_global_learning_rate(self):
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate()
# only create global lr_var once
if not isinstance(lr_var, framework.Variable):
......@@ -299,7 +296,7 @@ class Optimizer(object):
"""
:api_attr: imperative
Set the value of the learning rate manually in the optimizer. If the optimizer uses _LRScheduler,
Set the value of the learning rate manually in the optimizer. If the optimizer uses LRScheduler,
this API cannot be invoked, because it would lead to a conflict.
Args:
......@@ -312,7 +309,6 @@ class Optimizer(object):
.. code-block:: python
import paddle
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
......@@ -335,9 +331,9 @@ class Optimizer(object):
raise TypeError(
"The type of 'value' in optimizer.set_lr must be float, but received %s."
% (type(value)))
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
raise RuntimeError(
"optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict."
"optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict."
)
self._learning_rate = float(value)
current_lr = self._global_learning_rate()
......@@ -358,7 +354,7 @@ class Optimizer(object):
"""
:api_attr: imperative
Get the current step learning rate. The return value is the same for every call when _LRScheduler is not used;
Get the current step learning rate. The return value is the same for every call when LRScheduler is not used;
otherwise the learning rate of the current step is returned.
......@@ -370,15 +366,13 @@ class Optimizer(object):
import numpy as np
import paddle
# example1: _LRScheduler is not used, return value is all the same
paddle.disable_static()
# example1: LRScheduler is not used, return value is all the same
emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
lr = adam.get_lr()
print(lr) # 0.001
# example2: PiecewiseLR is used, return the step learning rate
paddle.disable_static()
# example2: PiecewiseDecay is used, return the scheduled learning rate
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
......@@ -387,7 +381,7 @@ class Optimizer(object):
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0)
scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value, 0)
adam = paddle.optimizer.Adam(scheduler,
parameters=linear.parameters())
......@@ -656,7 +650,6 @@ class Optimizer(object):
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
......@@ -727,7 +720,6 @@ class Optimizer(object):
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
......@@ -805,7 +797,7 @@ class Optimizer(object):
import numpy as np
import paddle
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
......@@ -854,13 +846,9 @@ class Optimizer(object):
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear(input)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
......@@ -903,7 +891,7 @@ class Optimizer(object):
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
......
......@@ -69,8 +69,8 @@ class RMSProp(Optimizer):
Parameters:
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler.
learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler.
rho(float): rho is :math: `\\rho` in equation, default is 0.95.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to
avoid division by zero, default is 1e-6.
......