Unverified commit e122e164, authored by Zhou Wei, committed by GitHub

Fix English docs and unit tests, and remove unused aliases of the 2.0 lr_scheduler (#27686)

* fix doc and unittest of 2.0 lr_scheduler

* fix doc of 2.0 lr_scheduler

* fix unittest

* fix english doc of lr_scheduler

* fix api name of lr scheduler

* fix api name of lr scheduler
Parent commit: 9215ad96
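For readers skimming the diff: the 2.0 learning-rate schedulers move from paddle.optimizer.lr_scheduler (with *LR class names) to paddle.optimizer.lr (with *Decay names), and the private _LRScheduler base class becomes the public LRScheduler. A minimal dygraph sketch of the renamed API, mirroring the updated docstrings below (paddle.nn.Linear is used only as a stand-in layer):

    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(10, 10)

    # new module and name: paddle.optimizer.lr.NoamDecay (was paddle.optimizer.lr_scheduler.NoamLR)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
    adam = paddle.optimizer.Adam(learning_rate=scheduler,
                                 parameters=linear.parameters())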
...@@ -237,13 +237,6 @@ from .framework import save #DEFINE_ALIAS ...@@ -237,13 +237,6 @@ from .framework import save #DEFINE_ALIAS
from .framework import load #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS
from .framework import DataParallel #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS
from .framework import NoamDecay #DEFINE_ALIAS
from .framework import PiecewiseDecay #DEFINE_ALIAS
from .framework import NaturalExpDecay #DEFINE_ALIAS
from .framework import ExponentialDecay #DEFINE_ALIAS
from .framework import InverseTimeDecay #DEFINE_ALIAS
from .framework import PolynomialDecay #DEFINE_ALIAS
from .framework import CosineDecay #DEFINE_ALIAS
from .framework import set_default_dtype #DEFINE_ALIAS from .framework import set_default_dtype #DEFINE_ALIAS
from .framework import get_default_dtype #DEFINE_ALIAS from .framework import get_default_dtype #DEFINE_ALIAS
......
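Since the hunk above removes the top-level paddle.NoamDecay / paddle.PiecewiseDecay / ... aliases, code that relied on them would now import the schedulers from their canonical 2.0 module (the exact paths added to the directory test further down). A short sketch; note that the removed CosineDecay alias has no same-name 2.0 class, the test changes below swap fluid's cosine_decay for CosineAnnealingDecay:

    # the removed paddle.* aliases are superseded by the paddle.optimizer.lr module
    from paddle.optimizer.lr import (NoamDecay, PiecewiseDecay, NaturalExpDecay,
                                     ExponentialDecay, InverseTimeDecay,
                                     PolynomialDecay, CosineAnnealingDecay)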
...@@ -164,7 +164,7 @@ def load_dygraph(model_path, **configs): ...@@ -164,7 +164,7 @@ def load_dygraph(model_path, **configs):
state_dict = emb.state_dict() state_dict = emb.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy") fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr_scheduler.NoamLR( scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True) d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=scheduler, learning_rate=scheduler,
......
...@@ -855,7 +855,7 @@ class Executor(object): ...@@ -855,7 +855,7 @@ class Executor(object):
def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
return_numpy, return_merged): return_numpy, return_merged):
from paddle.optimizer.lr_scheduler import _LRScheduler from paddle.optimizer.lr import LRScheduler
exe = program._executor exe = program._executor
# TODO(zhenghuihuang): quantization uses Graph in CompiledProgram # TODO(zhenghuihuang): quantization uses Graph in CompiledProgram
# instead of program. We will add support for checking Vars in Graph # instead of program. We will add support for checking Vars in Graph
...@@ -901,7 +901,7 @@ class Executor(object): ...@@ -901,7 +901,7 @@ class Executor(object):
if hasattr(program._program, 'lr_sheduler'): if hasattr(program._program, 'lr_sheduler'):
lr_sheduler = program._program.lr_sheduler lr_sheduler = program._program.lr_sheduler
assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler" assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
lr_value = lr_sheduler() lr_value = lr_sheduler()
lr_var = program._program.global_block().vars[lr_sheduler._var_name] lr_var = program._program.global_block().vars[lr_sheduler._var_name]
lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
...@@ -1238,7 +1238,7 @@ class Executor(object): ...@@ -1238,7 +1238,7 @@ class Executor(object):
def _run_program(self, program, feed, fetch_list, feed_var_name, def _run_program(self, program, feed, fetch_list, feed_var_name,
fetch_var_name, scope, return_numpy, use_program_cache): fetch_var_name, scope, return_numpy, use_program_cache):
from paddle.optimizer.lr_scheduler import _LRScheduler from paddle.optimizer.lr import LRScheduler
if feed is None: if feed is None:
feed = {} feed = {}
elif isinstance(feed, (list, tuple)): elif isinstance(feed, (list, tuple)):
...@@ -1296,7 +1296,7 @@ class Executor(object): ...@@ -1296,7 +1296,7 @@ class Executor(object):
self._feed_data(program, feed, feed_var_name, scope) self._feed_data(program, feed, feed_var_name, scope)
if hasattr(program, 'lr_sheduler'): if hasattr(program, 'lr_sheduler'):
assert isinstance(program.lr_sheduler, assert isinstance(program.lr_sheduler,
_LRScheduler), "must be _LRScheduler" LRScheduler), "must be LRScheduler"
lr_sheduler = program.lr_sheduler lr_sheduler = program.lr_sheduler
lr_value = lr_sheduler() lr_value = lr_sheduler()
lr_var = program.global_block().vars[lr_sheduler._var_name] lr_var = program.global_block().vars[lr_sheduler._var_name]
......
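The Executor changes above amount to a duck-typed check before each run: if the program carries an lr_sheduler attribute (the upstream spelling) that is an LRScheduler, its current value is computed and fed into the learning-rate variable. A simplified sketch of that logic; the helper name is hypothetical:

    from paddle.optimizer.lr import LRScheduler

    def _feed_scheduler_lr(program):   # hypothetical helper mirroring the hunks above
        if hasattr(program, 'lr_sheduler'):
            lr_sheduler = program.lr_sheduler
            assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
            lr_value = lr_sheduler()   # __call__ returns the last computed lr as a float
            # the executor then writes lr_value into the variable named
            # lr_sheduler._var_name in the program's global block
            return lr_value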
...@@ -70,15 +70,15 @@ class Optimizer(object): ...@@ -70,15 +70,15 @@ class Optimizer(object):
grad_clip=None, grad_clip=None,
name=None): name=None):
# Because of the loop import, so place it in the function body # Because of the loop import, so place it in the function body
from paddle.optimizer.lr_scheduler import _LRScheduler from paddle.optimizer.lr import LRScheduler
self._parameter_list = list( self._parameter_list = list(
parameter_list) if parameter_list is not None else None parameter_list) if parameter_list is not None else None
self._name = name self._name = name
if framework.in_dygraph_mode(): if framework.in_dygraph_mode():
if not isinstance(learning_rate, if not isinstance(learning_rate,
(float, LearningRateDecay, _LRScheduler)): (float, LearningRateDecay, LRScheduler)):
raise TypeError( raise TypeError(
"learning rate should be float or _LRScheduler, got %s here" "learning rate should be float or LRScheduler, got %s here"
% type(learning_rate)) % type(learning_rate))
if self._parameter_list is None: if self._parameter_list is None:
raise AttributeError( raise AttributeError(
...@@ -94,9 +94,9 @@ class Optimizer(object): ...@@ -94,9 +94,9 @@ class Optimizer(object):
break break
else: else:
if not isinstance(learning_rate, if not isinstance(learning_rate,
(float, framework.Variable, _LRScheduler)): (float, framework.Variable, LRScheduler)):
raise TypeError( raise TypeError(
"learning rate should be float or _LRScheduler, got %s here" "learning rate should be float or LRScheduler, got %s here"
% type(learning_rate)) % type(learning_rate))
if grad_clip is not None: if grad_clip is not None:
...@@ -147,13 +147,13 @@ class Optimizer(object): ...@@ -147,13 +147,13 @@ class Optimizer(object):
state_dict = adam.state_dict() state_dict = adam.state_dict()
''' '''
from paddle.optimizer.lr_scheduler import _LRScheduler from paddle.optimizer.lr import LRScheduler
state_dict = {} state_dict = {}
for k, v in self._accumulators.items(): for k, v in self._accumulators.items():
for para_name, var_tmp in v.items(): for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp state_dict[var_tmp.name] = var_tmp
# global step if use lr decay # global step if use lr decay
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict() state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
return state_dict return state_dict
if isinstance(self._learning_rate, LearningRateDecay): if isinstance(self._learning_rate, LearningRateDecay):
...@@ -193,7 +193,7 @@ class Optimizer(object): ...@@ -193,7 +193,7 @@ class Optimizer(object):
state_dict = emb.state_dict() state_dict = emb.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy") fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr_scheduler.NoamLR( scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True) d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=scheduler, learning_rate=scheduler,
...@@ -203,8 +203,8 @@ class Optimizer(object): ...@@ -203,8 +203,8 @@ class Optimizer(object):
para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
''' '''
from paddle.optimizer.lr_scheduler import _LRScheduler from paddle.optimizer.lr import LRScheduler
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_dict(state_dict["LR_Scheduler"]) self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, LearningRateDecay): if isinstance(self._learning_rate, LearningRateDecay):
...@@ -269,8 +269,8 @@ class Optimizer(object): ...@@ -269,8 +269,8 @@ class Optimizer(object):
return self._opti_name_list return self._opti_name_list
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
from paddle.optimizer.lr_scheduler import _LRScheduler from paddle.optimizer.lr import LRScheduler
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate() lr_var = self._global_learning_rate()
# only create global lr_var once # only create global lr_var once
if not isinstance(lr_var, framework.Variable): if not isinstance(lr_var, framework.Variable):
......
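With an LRScheduler as the learning rate, the fluid optimizer's state_dict() gains an "LR_Scheduler" entry holding the scheduler state, and set_state_dict() restores it, as the hunks above show. A sketch assuming the 2.0 paddle.optimizer.Adam mirrors this behavior:

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100)
    adam = paddle.optimizer.Adam(learning_rate=scheduler,
                                 parameters=linear.parameters())

    state = adam.state_dict()
    # key name assumed to match the fluid code above
    assert "LR_Scheduler" in state
    adam.set_state_dict(state)   # restores the scheduler state as well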
...@@ -455,8 +455,8 @@ class TestAdamOpV2(unittest.TestCase): ...@@ -455,8 +455,8 @@ class TestAdamOpV2(unittest.TestCase):
state_dict = adam.state_dict() state_dict = adam.state_dict()
adam.set_state_dict(state_dict) adam.set_state_dict(state_dict)
#learning_rate is _LRScheduler #learning_rate is LRScheduler
learning_rate = paddle.optimizer.CosineAnnealingLR( learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.1, T_max=10) learning_rate=0.1, T_max=10)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=learning_rate, learning_rate=learning_rate,
......
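The updated test builds the scheduler through paddle.optimizer.lr.CosineAnnealingDecay and hands it to Adam. The usual dygraph training pattern, sketched below with paddle.rand and paddle.mean standing in for real data and loss, calls scheduler.step() once per epoch after the optimizer update:

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.1, T_max=10)
    adam = paddle.optimizer.Adam(learning_rate=scheduler,
                                 parameters=linear.parameters())

    for epoch in range(10):
        x = paddle.rand([4, 10])
        loss = paddle.mean(linear(x))
        loss.backward()
        adam.step()
        adam.clear_grad()
        scheduler.step()   # update the learning rate for the next epoch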
...@@ -43,14 +43,22 @@ class TestDirectory(unittest.TestCase): ...@@ -43,14 +43,22 @@ class TestDirectory(unittest.TestCase):
'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.distributed.prepare_context', 'paddle.DataParallel',
'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static',
'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay', 'paddle.jit.save', 'paddle.jit.load',
'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', 'paddle.optimizer.lr.LRScheduler', 'paddle.optimizer.lr.NoamDecay',
'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', 'paddle.optimizer.lr.PiecewiseDecay',
'paddle.PolynomialDecay', 'paddle.CosineDecay', 'paddle.optimizer.lr.NaturalExpDecay',
'paddle.static.Executor', 'paddle.static.global_scope', 'paddle.optimizer.lr.ExponentialDecay',
'paddle.static.scope_guard', 'paddle.static.append_backward', 'paddle.optimizer.lr.InverseTimeDecay',
'paddle.static.gradients', 'paddle.static.BuildStrategy', 'paddle.optimizer.lr.PolynomialDecay',
'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy', 'paddle.optimizer.lr.CosineAnnealingDecay',
'paddle.optimizer.lr.MultiStepDecay',
'paddle.optimizer.lr.StepDecay', 'paddle.optimizer.lr.LambdaDecay',
'paddle.optimizer.lr.ReduceOnPlateau',
'paddle.optimizer.lr.LinearWarmup', 'paddle.static.Executor',
'paddle.static.global_scope', 'paddle.static.scope_guard',
'paddle.static.append_backward', 'paddle.static.gradients',
'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram',
'paddle.static.ExecutionStrategy',
'paddle.static.default_main_program', 'paddle.static.default_main_program',
'paddle.static.default_startup_program', 'paddle.static.Program', 'paddle.static.default_startup_program', 'paddle.static.Program',
'paddle.static.name_scope', 'paddle.static.program_guard', 'paddle.static.name_scope', 'paddle.static.program_guard',
......
...@@ -23,7 +23,7 @@ import itertools ...@@ -23,7 +23,7 @@ import itertools
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer from paddle.fluid.optimizer import MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer
from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
...@@ -72,15 +72,17 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -72,15 +72,17 @@ class TestImperativeOptimizerBase(unittest.TestCase):
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace() ) else fluid.CPUPlace()
with fluid.dygraph.guard(place): try:
try: paddle.disable_static()
paddle.manual_seed(seed) paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed) paddle.framework.random._manual_program_seed(seed)
mlp = MLP() mlp = MLP()
optimizer = self.get_optimizer_dygraph( optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters()) parameter_list=mlp.parameters())
except Exception as e: except Exception as e:
assert str(e) == exception_message assert str(e) == exception_message
finally:
paddle.enable_static()
def _check_mlp(self, place=None): def _check_mlp(self, place=None):
seed = 90 seed = 90
...@@ -90,47 +92,55 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -90,47 +92,55 @@ class TestImperativeOptimizerBase(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
with fluid.dygraph.guard(place): paddle.disable_static(place)
paddle.manual_seed(seed) paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed) paddle.framework.random._manual_program_seed(seed)
mlp = MLP() mlp = MLP()
optimizer = self.get_optimizer_dygraph( optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters())
parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1) batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator( batch_py_reader.decorate_sample_list_generator(
paddle.batch( paddle.batch(
self.reader_decorator(paddle.dataset.mnist.train()), self.reader_decorator(paddle.dataset.mnist.train()),
batch_size=batch_size, batch_size=batch_size,
drop_last=True), drop_last=True),
places=fluid.CPUPlace()) places=fluid.CPUPlace())
dy_param_init_value = {} dy_param_init_value = {}
for batch_id, data in enumerate(batch_py_reader()): for batch_id, data in enumerate(batch_py_reader()):
if batch_id >= self.batch_num: if batch_id >= self.batch_num:
break break
img = data[0] img = data[0]
label = data[1] label = data[1]
label.stop_gradient = True
img = fluid.layers.reshape(img, shape=[batch_size, -1]) label.stop_gradient = True
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
if batch_id == 0: img = fluid.layers.reshape(img, shape=[batch_size, -1])
for param in mlp.parameters(): cost = mlp(img)
dy_param_init_value[param.name] = param.numpy() avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
avg_loss.backward() if batch_id == 0:
optimizer.minimize(avg_loss)
mlp.clear_gradients()
dy_param_value = {}
for param in mlp.parameters(): for param in mlp.parameters():
dy_param_value[param.name] = param.numpy() dy_param_init_value[param.name] = param.numpy()
avg_loss.backward()
optimizer.minimize(avg_loss)
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.ReduceOnPlateau):
optimizer._learning_rate.step(avg_loss)
else:
optimizer._learning_rate.step()
mlp.clear_gradients()
dy_param_value = {}
for param in mlp.parameters():
dy_param_value[param.name] = param.numpy()
paddle.enable_static()
with new_program_scope(): with new_program_scope():
paddle.manual_seed(seed) paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed) paddle.framework.random._manual_program_seed(seed)
...@@ -181,6 +191,13 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -181,6 +191,13 @@ class TestImperativeOptimizerBase(unittest.TestCase):
feed={"pixel": static_x_data, feed={"pixel": static_x_data,
"label": y_data}, "label": y_data},
fetch_list=fetch_list) fetch_list=fetch_list)
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.ReduceOnPlateau):
optimizer._learning_rate.step(out[0])
else:
optimizer._learning_rate.step()
static_param_value = {} static_param_value = {}
static_out = out[0] static_out = out[0]
...@@ -199,17 +216,19 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -199,17 +216,19 @@ class TestImperativeOptimizerBase(unittest.TestCase):
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9] bd = [3, 6, 9]
optimizer = SGDOptimizer( optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.PiecewiseLR( learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list) parameters=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
bd = [3, 6, 9] bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR( optimizer = paddle.optimizer.SGD(
boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
return optimizer return optimizer
def test_sgd(self): def test_sgd(self):
...@@ -218,21 +237,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): ...@@ -218,21 +237,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer( optimizer = paddle.optimizer.SGD(
learning_rate=fluid.layers.natural_exp_decay( learning_rate=paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.1, learning_rate=0.5, gamma=0.9),
decay_steps=10000, parameters=parameter_list)
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( optimizer = paddle.optimizer.SGD(
learning_rate=0.1, learning_rate=paddle.optimizer.lr.NaturalExpDecay(
decay_steps=10000, learning_rate=0.5, gamma=0.9))
decay_rate=0.5,
staircase=True))
return optimizer return optimizer
def test_sgd(self): def test_sgd(self):
...@@ -241,21 +255,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): ...@@ -241,21 +255,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer( optimizer = paddle.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay( learning_rate=paddle.optimizer.lr.ExponentialDecay(
learning_rate=0.1, learning_rate=0.5, gamma=0.9),
decay_steps=10000, parameters=parameter_list)
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( optimizer = paddle.optimizer.SGD(
learning_rate=0.1, learning_rate=paddle.optimizer.lr.ExponentialDecay(
decay_steps=10000, learning_rate=0.5, gamma=0.9))
decay_rate=0.5,
staircase=True))
return optimizer return optimizer
def test_sgd(self): def test_sgd(self):
...@@ -264,21 +273,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): ...@@ -264,21 +273,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam( optimizer = paddle.optimizer.Adam(
learning_rate=fluid.layers.inverse_time_decay( learning_rate=paddle.optimizer.lr.InverseTimeDecay(
learning_rate=0.1, learning_rate=0.5, gamma=0.9),
decay_steps=10000, parameters=parameter_list)
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( optimizer = paddle.optimizer.Adam(
learning_rate=0.1, learning_rate=paddle.optimizer.lr.InverseTimeDecay(
decay_steps=10000, learning_rate=0.5, gamma=0.9))
decay_rate=0.5,
staircase=True))
return optimizer return optimizer
def test_adam(self): def test_adam(self):
...@@ -287,15 +291,16 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): ...@@ -287,15 +291,16 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer( optimizer = paddle.optimizer.SGD(
learning_rate=fluid.layers.polynomial_decay( learning_rate=paddle.optimizer.lr.PolynomialDecay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle), learning_rate=0.5, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list) parameters=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( optimizer = paddle.optimizer.SGD(
learning_rate=0.1, decay_steps=5, cycle=self.cycle)) learning_rate=paddle.optimizer.lr.PolynomialDecay(
learning_rate=0.5, decay_steps=5, cycle=self.cycle))
return optimizer return optimizer
def test_sgd_cycle(self): def test_sgd_cycle(self):
...@@ -307,17 +312,18 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): ...@@ -307,17 +312,18 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
self._check_mlp() self._check_mlp()
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer( optimizer = paddle.optimizer.SGD(
learning_rate=fluid.layers.cosine_decay( learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.1, step_each_epoch=10000, epochs=120), learning_rate=0.5, T_max=5),
parameter_list=parameter_list) parameters=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( optimizer = paddle.optimizer.SGD(
learning_rate=0.1, step_each_epoch=10000, epochs=120)) learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.5, T_max=5))
return optimizer return optimizer
def test_sgd(self): def test_sgd(self):
...@@ -326,15 +332,110 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): ...@@ -326,15 +332,110 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list): def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer( optimizer = paddle.optimizer.SGD(
learning_rate=fluid.layers.noam_decay( learning_rate=paddle.optimizer.lr.NoamDecay(
d_model=512, warmup_steps=8000), d_model=0.01, warmup_steps=100, verbose=True),
parameter_list=parameter_list) parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LambdaDecay(
learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LambdaDecay(
learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5,
warmup_steps=20,
start_lr=0,
end_lr=0.5,
verbose=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.StepDecay(
learning_rate=0.5, step_size=5, gamma=0.8),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.StepDecay(
learning_rate=0.5, step_size=5, gamma=0.8))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=0.5),
parameters=parameter_list)
return optimizer return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( optimizer = paddle.optimizer.SGD(
d_model=512, warmup_steps=8000)) learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=0.5))
return optimizer return optimizer
def test_sgd(self): def test_sgd(self):
...@@ -381,7 +482,7 @@ class TestOptimizerLearningRate(unittest.TestCase): ...@@ -381,7 +482,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
bd = [2, 4, 6, 8] bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0] value = [0.2, 0.4, 0.6, 0.8, 1.0]
scheduler = paddle.optimizer.PiecewiseLR(bd, value) scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
scheduler, parameters=linear.parameters()) scheduler, parameters=linear.parameters())
...@@ -396,7 +497,7 @@ class TestOptimizerLearningRate(unittest.TestCase): ...@@ -396,7 +497,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
scheduler.step() scheduler.step()
def test_lr_decay_natural_exp(self): def test_lr_scheduler_natural_exp(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
...@@ -407,8 +508,7 @@ class TestOptimizerLearningRate(unittest.TestCase): ...@@ -407,8 +508,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
loss = fluid.layers.reduce_mean(b) loss = fluid.layers.reduce_mean(b)
base_lr = 1.0 base_lr = 1.0
scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5) scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5)
print("scheduler.last_lr", scheduler.last_lr)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
scheduler, parameters=linear.parameters()) scheduler, parameters=linear.parameters())
...@@ -453,7 +553,7 @@ class TestOptimizerLearningRate(unittest.TestCase): ...@@ -453,7 +553,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
with self.assertRaises(RuntimeError): with self.assertRaises(RuntimeError):
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
paddle.optimizer.NaturalExpLR( paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.1, gamma=0.5), learning_rate=0.1, gamma=0.5),
parameters=linear.parameters()) parameters=linear.parameters())
adam.set_lr(0.01) adam.set_lr(0.01)
...@@ -695,10 +795,10 @@ class TestImperativeOptimizerList(unittest.TestCase): ...@@ -695,10 +795,10 @@ class TestImperativeOptimizerList(unittest.TestCase):
linear_1 = Linear(10, 10) linear_1 = Linear(10, 10)
linear_2 = Linear(10, 10) linear_2 = Linear(10, 10)
sgd = SGDOptimizer( sgd = paddle.optimizer.SGD(1.0,
1.0, parameters=itertools.chain(
parameter_list=itertools.chain(linear_1.parameters(), linear_1.parameters(),
linear_2.parameters())) linear_2.parameters()))
in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
in_data = fluid.dygraph.to_variable(in_np) in_data = fluid.dygraph.to_variable(in_np)
......
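Note the special case threaded through _check_mlp above: ReduceOnPlateau is the one scheduler whose step() takes the monitored metric rather than no argument. A minimal sketch of that call pattern (random data as a stand-in):

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler,
                               parameters=linear.parameters())

    for epoch in range(5):
        x = paddle.rand([4, 10])
        loss = paddle.mean(linear(x))
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        scheduler.step(loss)   # ReduceOnPlateau adjusts the lr based on the passed metric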
...@@ -239,7 +239,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -239,7 +239,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR( scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr) boundaries=bd, values=lr_arr)
adam = Adam( adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters()) learning_rate=scheduler, parameters=ptb_model.parameters())
...@@ -328,7 +328,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -328,7 +328,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR( scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr) boundaries=bd, values=lr_arr)
adam = Adam( adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters()) learning_rate=scheduler, parameters=ptb_model.parameters())
...@@ -436,7 +436,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -436,7 +436,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR( scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr) boundaries=bd, values=lr_arr)
adam = Adam( adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters()) learning_rate=scheduler, parameters=ptb_model.parameters())
...@@ -544,7 +544,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -544,7 +544,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR( scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr) boundaries=bd, values=lr_arr)
adam = Adam( adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters()) learning_rate=scheduler, parameters=ptb_model.parameters())
...@@ -829,7 +829,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -829,7 +829,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR( scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr) boundaries=bd, values=lr_arr)
adam = Adam( adam = Adam(
learning_rate=scheduler, learning_rate=scheduler,
......
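The PTB checkpoint tests above build paddle.optimizer.lr.PiecewiseDecay from boundary steps and per-interval values; as in the other tests, values has one more entry than boundaries. A small sketch:

    import paddle

    # lr is 0.2 before epoch 2, 0.4 for epochs 2-3, ..., 1.0 from epoch 8 on
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2, 4, 6, 8],
                                                   values=[0.2, 0.4, 0.6, 0.8, 1.0])
    for epoch in range(10):
        print(epoch, scheduler())   # last computed learning rate
        scheduler.step()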
...@@ -56,22 +56,22 @@ def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss, ...@@ -56,22 +56,22 @@ def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
return var_list[1] return var_list[1]
class TestReduceLROnPlateauDecay(object): class TestReduceOnPlateauDecay(object):
def test_ReduceLR(self): def test_ReduceLR(self):
# the decay rate must be less than 1.0 # the decay rate must be less than 1.0
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0) paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=2.0)
# the mode must be "min" or "max" # the mode must be "min" or "max"
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test") paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test")
# the threshold_mode must be "rel" or "abs" # the threshold_mode must be "rel" or "abs"
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau( paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=1.0, threshold_mode="test") learning_rate=1.0, threshold_mode="test")
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
paddle.optimizer.ReduceLROnPlateau(learning_rate="test") paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test")
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test") paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5).step("test")
places = [paddle.CPUPlace()] places = [paddle.CPUPlace()]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
...@@ -114,7 +114,7 @@ class TestReduceLROnPlateauDecay(object): ...@@ -114,7 +114,7 @@ class TestReduceLROnPlateauDecay(object):
[1], 1, 'float32', persistable=True) [1], 1, 'float32', persistable=True)
paddle.increment(x) paddle.increment(x)
loss = paddle.sin(x) loss = paddle.sin(x)
scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam = paddle.optimizer.Adam(learning_rate=scheduler) adam = paddle.optimizer.Adam(learning_rate=scheduler)
adam.minimize(loss) adam.minimize(loss)
lr_var = adam._global_learning_rate() lr_var = adam._global_learning_rate()
...@@ -158,7 +158,7 @@ class TestReduceLROnPlateauDecay(object): ...@@ -158,7 +158,7 @@ class TestReduceLROnPlateauDecay(object):
var_list = [best, current_lr, cooldown_counter, num_bad_epochs] var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=scheduler, parameters=linear.parameters()) learning_rate=scheduler, parameters=linear.parameters())
...@@ -180,7 +180,7 @@ class TestReduceLROnPlateauDecay(object): ...@@ -180,7 +180,7 @@ class TestReduceLROnPlateauDecay(object):
loss, var_list) loss, var_list)
self.assertEqual(current_lr, expected_lr) self.assertEqual(current_lr, expected_lr)
state_dict = adam.state_dict() state_dict = adam.state_dict()
scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs) scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam1 = paddle.optimizer.Adam( adam1 = paddle.optimizer.Adam(
learning_rate=scheduler1, parameters=linear.parameters()) learning_rate=scheduler1, parameters=linear.parameters())
adam1.set_state_dict(state_dict) adam1.set_state_dict(state_dict)
...@@ -420,7 +420,7 @@ class TestLRScheduler(unittest.TestCase): ...@@ -420,7 +420,7 @@ class TestLRScheduler(unittest.TestCase):
adam.clear_grad() adam.clear_grad()
current_lr = adam.get_lr() current_lr = adam.get_lr()
expected_lr = python_func(epoch, **kwarg) expected_lr = python_func(epoch, **kwarg)
if paddle_api.__name__ != "CosineAnnealingLR": if paddle_api.__name__ != "CosineAnnealingDecay":
self.assertEqual(current_lr, expected_lr) self.assertEqual(current_lr, expected_lr)
scheduler.step() scheduler.step()
else: else:
...@@ -429,74 +429,75 @@ class TestLRScheduler(unittest.TestCase): ...@@ -429,74 +429,75 @@ class TestLRScheduler(unittest.TestCase):
def test_scheduler(self): def test_scheduler(self):
with self.assertRaises(NotImplementedError): with self.assertRaises(NotImplementedError):
paddle.optimizer.lr_scheduler._LRScheduler().step() paddle.optimizer.lr.LRScheduler().step()
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
paddle.optimizer.MultiStepLR( paddle.optimizer.lr.MultiStepDecay(
learning_rate="test", milestones=[1, 2, 3]) learning_rate="test", milestones=[1, 2, 3])
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test') paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones='test')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.optimizer.MultiStepLR( paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[3, 2, 1]) learning_rate=0.5, milestones=[3, 2, 1])
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.optimizer.MultiStepLR( paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[1, 2, 3], gamma=2) learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, { func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, {
"d_model": 0.01, "d_model": 0.01,
"warmup_steps": 100, "warmup_steps": 100,
"verbose": False "verbose": False
}), (piecewise_lr, paddle.optimizer.PiecewiseLR, { }), (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, {
"boundaries": [3, 6, 9, 15, 20], "boundaries": [3, 6, 9, 15, 20],
"values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
"verbose": False "verbose": False
}), (natural_exp_lr, paddle.optimizer.NaturalExpLR, { }), (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"gamma": 0.1, "gamma": 0.1,
"verbose": True "verbose": True
}), (inverse_time_lr, paddle.optimizer.InverseTimeLR, { }), (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"gamma": 0.1, "gamma": 0.1,
"verbose": False "verbose": False
}), (polynomial_lr, paddle.optimizer.PolynomialLR, { }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"decay_steps": 20, "decay_steps": 20,
"end_lr": 0, "end_lr": 0,
"power": 1.0, "power": 1.0,
"cycle": False, "cycle": False,
"verbose": True "verbose": True
}), (polynomial_lr, paddle.optimizer.PolynomialLR, { }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"decay_steps": 20, "decay_steps": 20,
"end_lr": 0, "end_lr": 0,
"power": 1.0, "power": 1.0,
"cycle": True, "cycle": True,
"verbose": False "verbose": False
}), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, { }), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, {
'learning_rate': 0.5, 'learning_rate': 0.5,
'warmup_steps': 20, 'warmup_steps': 20,
'start_lr': 0, 'start_lr': 0,
'end_lr': 0.5, 'end_lr': 0.5,
"verbose": True "verbose": True
}), (exponential_lr, paddle.optimizer.ExponentialLR, { }), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"gamma": 0.9, "gamma": 0.9,
"verbose": False "verbose": False
}), (multi_step_lr, paddle.optimizer.MultiStepLR, { }), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"milestones": [3, 6, 9, 15, 20], "milestones": [3, 6, 9, 15, 20],
"gamma": 0.8, "gamma": 0.8,
"verbose": True "verbose": True
}), (step_lr, paddle.optimizer.StepLR, { }), (step_lr, paddle.optimizer.lr.StepDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"step_size": 2, "step_size": 2,
"gamma": 0.8, "gamma": 0.8,
"verbose": False "verbose": False
}), (lambda_lr, paddle.optimizer.LambdaLR, { }), (lambda_lr, paddle.optimizer.lr.LambdaDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"lr_lambda": lambda x: 0.95**x, "lr_lambda": lambda x: 0.95**x,
"verbose": True "verbose": True
}), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, { }), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, {
"learning_rate": 0.5, "learning_rate": 0.5,
"T_max": 10, "T_max": 10,
"verbose": False "verbose": False
......
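The scheduler tests above also exercise the state_dict()/set_state_dict() pair; by default the saved keys are last_epoch and last_lr, per the base-class change further below. A sketch of the round trip:

    import paddle

    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.8)
    for _ in range(3):
        scheduler.step()

    state = scheduler.state_dict()   # contains 'last_epoch' and 'last_lr' by default
    restored = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.8)
    restored.set_state_dict(state)
    assert restored() == scheduler()   # same last computed learning rate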
...@@ -24,11 +24,6 @@ __all__ += [ ...@@ -24,11 +24,6 @@ __all__ += [
'DataParallel' 'DataParallel'
] ]
__all__ += [
'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
]
from . import random from . import random
from .random import manual_seed from .random import manual_seed
from .framework import get_default_dtype from .framework import get_default_dtype
...@@ -51,11 +46,3 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS ...@@ -51,11 +46,3 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS
from .io import save from .io import save
from .io import load from .io import load
from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS
...@@ -228,7 +228,7 @@ def save(obj, path): ...@@ -228,7 +228,7 @@ def save(obj, path):
layer_state_dict = emb.state_dict() layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams") paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr_scheduler.NoamLR( scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True) d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=scheduler, learning_rate=scheduler,
...@@ -320,7 +320,7 @@ def load(path, **configs): ...@@ -320,7 +320,7 @@ def load(path, **configs):
layer_state_dict = emb.state_dict() layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams") paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr_scheduler.NoamLR( scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True) d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=scheduler, learning_rate=scheduler,
......
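The docstring fixes above keep the same save/load flow. Persisting the optimizer state, which embeds the scheduler state, might look like the following; paddle.nn.Embedding and the file names are taken from the docstrings, while the .pdopt round trip is an assumption:

    import paddle

    emb = paddle.nn.Embedding(10, 10)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
    adam = paddle.optimizer.Adam(learning_rate=scheduler,
                                 parameters=emb.parameters())

    paddle.save(emb.state_dict(), "emb.pdparams")   # as in the docstring above
    paddle.save(adam.state_dict(), "adam.pdopt")    # optimizer state incl. the scheduler

    layer_state = paddle.load("emb.pdparams")
    opt_state = paddle.load("adam.pdopt")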
...@@ -121,13 +121,6 @@ from .layer.conv import ConvTranspose3d #DEFINE_ALIAS ...@@ -121,13 +121,6 @@ from .layer.conv import ConvTranspose3d #DEFINE_ALIAS
# from .layer.conv import TreeConv #DEFINE_ALIAS # from .layer.conv import TreeConv #DEFINE_ALIAS
# from .layer.conv import Conv1D #DEFINE_ALIAS # from .layer.conv import Conv1D #DEFINE_ALIAS
from .layer.extension import RowConv #DEFINE_ALIAS from .layer.extension import RowConv #DEFINE_ALIAS
# from .layer.learning_rate import CosineDecay #DEFINE_ALIAS
# from .layer.learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .layer.learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .layer.learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .layer.learning_rate import NoamDecay #DEFINE_ALIAS
# from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS
from .layer.common import Linear from .layer.common import Linear
# from .layer.loss import NCELoss #DEFINE_ALIAS # from .layer.loss import NCELoss #DEFINE_ALIAS
from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS
......
...@@ -95,14 +95,6 @@ from .extension import target_assign #DEFINE_ALIAS ...@@ -95,14 +95,6 @@ from .extension import target_assign #DEFINE_ALIAS
from .extension import temporal_shift #DEFINE_ALIAS from .extension import temporal_shift #DEFINE_ALIAS
from .extension import warpctc #DEFINE_ALIAS from .extension import warpctc #DEFINE_ALIAS
from .extension import diag_embed #DEFINE_ALIAS from .extension import diag_embed #DEFINE_ALIAS
from .learning_rate import cosine_decay #DEFINE_ALIAS
from .learning_rate import exponential_decay #DEFINE_ALIAS
from .learning_rate import inverse_time_decay #DEFINE_ALIAS
from .learning_rate import natural_exp_decay #DEFINE_ALIAS
from .learning_rate import noam_decay #DEFINE_ALIAS
from .learning_rate import piecewise_decay #DEFINE_ALIAS
from .learning_rate import polynomial_decay #DEFINE_ALIAS
from .learning_rate import linear_lr_warmup #DEFINE_ALIAS
# from .lod import sequence_concat #DEFINE_ALIAS # from .lod import sequence_concat #DEFINE_ALIAS
# from .lod import sequence_conv #DEFINE_ALIAS # from .lod import sequence_conv #DEFINE_ALIAS
# from .lod import sequence_enumerate #DEFINE_ALIAS # from .lod import sequence_enumerate #DEFINE_ALIAS
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
from ...fluid.layers import cosine_decay #DEFINE_ALIAS
from ...fluid.layers import exponential_decay #DEFINE_ALIAS
from ...fluid.layers import inverse_time_decay #DEFINE_ALIAS
from ...fluid.layers import natural_exp_decay #DEFINE_ALIAS
from ...fluid.layers import noam_decay #DEFINE_ALIAS
from ...fluid.layers import piecewise_decay #DEFINE_ALIAS
from ...fluid.layers import polynomial_decay #DEFINE_ALIAS
from ...fluid.layers import linear_lr_warmup #DEFINE_ALIAS
__all__ = [
'cosine_decay', 'exponential_decay', 'inverse_time_decay',
'natural_exp_decay', 'noam_decay', 'piecewise_decay', 'polynomial_decay',
'linear_lr_warmup'
]
...@@ -86,13 +86,6 @@ from .conv import ConvTranspose3d #DEFINE_ALIAS ...@@ -86,13 +86,6 @@ from .conv import ConvTranspose3d #DEFINE_ALIAS
# from .conv import TreeConv #DEFINE_ALIAS # from .conv import TreeConv #DEFINE_ALIAS
# from .conv import Conv1D #DEFINE_ALIAS # from .conv import Conv1D #DEFINE_ALIAS
from .extension import RowConv #DEFINE_ALIAS from .extension import RowConv #DEFINE_ALIAS
# from .learning_rate import CosineDecay #DEFINE_ALIAS
# from .learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .learning_rate import NoamDecay #DEFINE_ALIAS
# from .learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .learning_rate import PolynomialDecay #DEFINE_ALIAS
# from .loss import NCELoss #DEFINE_ALIAS # from .loss import NCELoss #DEFINE_ALIAS
from .loss import BCEWithLogitsLoss #DEFINE_ALIAS from .loss import BCEWithLogitsLoss #DEFINE_ALIAS
from .loss import CrossEntropyLoss #DEFINE_ALIAS from .loss import CrossEntropyLoss #DEFINE_ALIAS
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
__all__ = [
# 'CosineDecay',
# 'ExponentialDecay',
# 'InverseTimeDecay',
# 'NaturalExpDecay',
# 'NoamDecay',
# 'PiecewiseDecay',
# 'PolynomialDecay'
]
...@@ -16,10 +16,7 @@ __all__ = [ ...@@ -16,10 +16,7 @@ __all__ = [
'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd', 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd',
'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer', 'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer',
'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer'
'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
'ReduceLROnPlateau', 'CosineAnnealingLR'
] ]
...@@ -36,6 +33,4 @@ from .adadelta import Adadelta ...@@ -36,6 +33,4 @@ from .adadelta import Adadelta
from .sgd import SGD from .sgd import SGD
from .momentum import Momentum from .momentum import Momentum
from . import lr_scheduler from . import lr
from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR
...@@ -48,8 +48,8 @@ class Adam(Optimizer): ...@@ -48,8 +48,8 @@ class Adam(Optimizer):
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_ Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args: Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001. It can be a float value or a LRScheduler. The default value is 0.001.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32. It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9. The default value is 0.9.
......
...@@ -47,8 +47,8 @@ class Adamax(Optimizer): ...@@ -47,8 +47,8 @@ class Adamax(Optimizer):
it is added here for numerical stability to prevent the division by 0 error. it is added here for numerical stability to prevent the division by 0 error.
Args: Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001. It can be a float value or a LRScheduler. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates. beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9. The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
......
...@@ -42,8 +42,8 @@ class AdamW(Adam): ...@@ -42,8 +42,8 @@ class AdamW(Adam):
Args: Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001. It can be a float value or a LRScheduler. The default value is 0.001.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \ This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated. The default value is None in static mode, at this time all parameters will be updated.
......
...@@ -18,18 +18,62 @@ import warnings ...@@ -18,18 +18,62 @@ import warnings
from paddle import Tensor from paddle import Tensor
__all__ = [ __all__ = [
'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay',
'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', 'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay',
'ReduceLROnPlateau', 'CosineAnnealingLR' 'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau',
'CosineAnnealingDecay'
] ]
class _LRScheduler(object): class LRScheduler(object):
"""LRScheduler Base class. """
LRScheduler Base class. Define the common interface of a learning rate scheduler.
User can import it by ``form paddle.optimizer.lr import LRScheduler`` ,
then overload it for your subclass and have a custom implementation of ``get_lr()`` .
Otherwise, an ``NotImplementedError`` exception will be thrown.
Args:
learning_rate (float): The initial learning rate. It is a python float number.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
instance to schedule learning rate.
Examples:
Here is an example of a simple ``StepDecay`` implementation.
.. code-block:: python
import paddle
form paddle.optimizer.lr import LRScheduler
class StepDecay(LRScheduler):
def __init__(self,
learning_rate,
step_size,
gamma=0.1,
last_epoch=-1,
verbose=False):
if not isinstance(step_size, int):
raise TypeError(
"The type of 'step_size' must be 'int', but received %s." %
type(step_size))
if gamma >= 1.0:
raise ValueError('gamma should be < 1.0.')
self.step_size = step_size
self.gamma = gamma
super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self):
i = self.last_epoch // self.step_size
return self.base_lr * (self.gamma**i)
Define the common interface of an LRScheduler.
User can 'form paddle.optimizer.lr_scheduler import _LRScheduler'
And inherit from it to have a custom implementation of get_lr().
""" """
def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
...@@ -47,23 +91,22 @@ class _LRScheduler(object): ...@@ -47,23 +91,22 @@ class _LRScheduler(object):
def __call__(self): def __call__(self):
""" """
Return last computed learning rate on current epoch. Return lastest computed learning rate on current epoch.
""" """
return self.last_lr return self.last_lr
def step(self, epoch=None): def step(self, epoch=None):
""" """
'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'.
The new learning rate will take effect on next epoch. ``step`` should be called after ``optimizer.step`` . It will update the learning rate in the optimizer according to the current ``epoch`` .
The new learning rate will take effect on the next ``optimizer.step`` .
Args: Args:
epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
Returns: Returns:
None None
Examples:
Please refer to the example of current _LRScheduler.
""" """
if epoch is None: if epoch is None:
self.last_epoch += 1 self.last_epoch += 1
...@@ -81,11 +124,12 @@ class _LRScheduler(object): ...@@ -81,11 +124,12 @@ class _LRScheduler(object):
def state_dict(self): def state_dict(self):
""" """
Returns the state of the scheduler as a :class:`dict`. Returns the state of the scheduler as a :class:`dict`.
It is a subset of self.__dict__ . It is a subset of ``self.__dict__`` .
""" """
self._state_keys() self.state_keys()
state_dict = {} state_dict = {}
for key in self.keys: for key in self.keys:
if key not in self.__dict__: if key not in self.__dict__:
...@@ -101,19 +145,26 @@ class _LRScheduler(object): ...@@ -101,19 +145,26 @@ class _LRScheduler(object):
return state_dict return state_dict
# For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default. # For those subclass who overload LRScheduler, "last_epoch, last_lr" will be saved by default.
# (Note): you can change it for your subclass. # (Note): you can change it for your subclass.
def _state_keys(self): def state_keys(self):
""" """
Set the keys in ``self.__dict__`` that need to be saved.
For subclasses that overload ``LRScheduler`` (Base Class), "last_epoch, last_lr" will be saved by default through ``self.keys = ['last_epoch', 'last_lr']`` .
``last_epoch`` is the current epoch number, and ``last_lr`` is the current learning rate.
If you want to change the default behavior, you should have a custom implementation of ``state_keys()`` to redefine ``self.keys`` .
""" """
self.keys = ['last_epoch', 'last_lr'] self.keys = ['last_epoch', 'last_lr']
def set_state_dict(self, state_dict): def set_state_dict(self, state_dict):
""" """
Loads the schedulers state. Loads the schedulers state.
""" """
self._state_keys() self.state_keys()
for key in self.keys: for key in self.keys:
if key in state_dict: if key in state_dict:
self.__dict__[key] = state_dict[key] self.__dict__[key] = state_dict[key]
...@@ -130,14 +181,20 @@ class _LRScheduler(object): ...@@ -130,14 +181,20 @@ class _LRScheduler(object):
set_dict = set_state_dict set_dict = set_state_dict
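For illustration, a minimal sketch of round-tripping scheduler state through ``state_dict`` / ``set_state_dict`` , using the ``StepDecay`` scheduler defined later in this file (the loop length and hyper-parameters are illustrative; in practice the dict is usually saved together with the optimizer state):

import paddle

scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8)
for _ in range(7):
    scheduler.step()              # advance a few epochs so the state is non-trivial

state = scheduler.state_dict()    # a plain python dict holding 'last_epoch' and 'last_lr'

# later, e.g. when resuming training: rebuild the scheduler and restore its progress
resumed = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8)
resumed.set_state_dict(state)
print(resumed())                  # __call__ now returns the restored last_lr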
def get_lr(self): def get_lr(self):
"""
For subclasses that overload ``LRScheduler`` (Base Class), user should have a custom implementation of ``get_lr()`` .
Otherwise, a ``NotImplementedError`` exception will be thrown.
"""
# calculate by python float # calculate by python float
raise NotImplementedError raise NotImplementedError
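To make the contract concrete, a small sketch of how ``__call__`` , ``step`` and ``get_lr`` interact, using the ``StepDecay`` subclass defined later in this file (the hyper-parameters are illustrative):

import paddle

sched = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.1)
print(sched())    # __call__ returns last_lr, the value most recently produced by get_lr()
sched.step()      # advances last_epoch by 1, then recomputes last_lr via get_lr()
sched.step(10)    # or jump to a given epoch; get_lr() then yields 0.5 * 0.1 ** (10 // 2)
print(sched())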
class NoamLR(_LRScheduler): class NoamDecay(LRScheduler):
""" """
Applies Noam Lear to the initial learning rate. Applies Noam Decay to the initial learning rate.
The algorithm can be described as following. The algorithm can be described as following.
...@@ -156,7 +213,7 @@ class NoamLR(_LRScheduler): ...@@ -156,7 +213,7 @@ class NoamLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``NoamLR`` instance to schedule learning rate. ``NoamDecay`` instance to schedule learning rate.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -164,23 +221,21 @@ class NoamLR(_LRScheduler): ...@@ -164,23 +221,21 @@ class NoamLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -189,7 +244,7 @@ class NoamLR(_LRScheduler): ...@@ -189,7 +244,7 @@ class NoamLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -216,7 +271,7 @@ class NoamLR(_LRScheduler): ...@@ -216,7 +271,7 @@ class NoamLR(_LRScheduler):
verbose=False): verbose=False):
self.d_model = d_model self.d_model = d_model
self.warmup_steps = warmup_steps self.warmup_steps = warmup_steps
super(NoamLR, self).__init__(learning_rate, last_epoch, verbose) super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self): def get_lr(self):
if self.last_epoch == 0: if self.last_epoch == 0:
...@@ -227,7 +282,7 @@ class NoamLR(_LRScheduler): ...@@ -227,7 +282,7 @@ class NoamLR(_LRScheduler):
return self.base_lr * (self.d_model**-0.5) * min(a, b) return self.base_lr * (self.d_model**-0.5) * min(a, b)
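For reference, a standalone sketch of the Noam schedule (the definitions of ``a`` and ``b`` are elided in this diff, so the sketch follows the standard Noam formula), with the illustrative values d_model=0.01 and warmup_steps=100 from the example above:

d_model, warmup_steps, base_lr = 0.01, 100, 1.0
def noam_lr(epoch):                          # for epoch >= 1
    a = epoch ** -0.5                        # inverse-square-root decay branch
    b = epoch * warmup_steps ** -1.5         # linear warmup branch
    return base_lr * (d_model ** -0.5) * min(a, b)

# lr rises linearly until epoch == warmup_steps, then decays as 1/sqrt(epoch)
print([round(noam_lr(e), 3) for e in (1, 50, 100, 400)])   # -> [0.01, 0.5, 1.0, 0.5]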
class PiecewiseLR(_LRScheduler): class PiecewiseDecay(LRScheduler):
""" """
Piecewise learning rate scheduler. Piecewise learning rate scheduler.
...@@ -253,7 +308,7 @@ class PiecewiseLR(_LRScheduler): ...@@ -253,7 +308,7 @@ class PiecewiseLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``PiecewiseLR`` instance to schedule learning rate. ``PiecewiseDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -262,23 +317,21 @@ class PiecewiseLR(_LRScheduler): ...@@ -262,23 +317,21 @@ class PiecewiseLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -287,7 +340,7 @@ class PiecewiseLR(_LRScheduler): ...@@ -287,7 +340,7 @@ class PiecewiseLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -308,7 +361,7 @@ class PiecewiseLR(_LRScheduler): ...@@ -308,7 +361,7 @@ class PiecewiseLR(_LRScheduler):
def __init__(self, boundaries, values, last_epoch=-1, verbose=False): def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
self.boundaries = boundaries self.boundaries = boundaries
self.values = values self.values = values
super(PiecewiseLR, self).__init__( super(PiecewiseDecay, self).__init__(
last_epoch=last_epoch, verbose=verbose) last_epoch=last_epoch, verbose=verbose)
def get_lr(self): def get_lr(self):
...@@ -319,7 +372,7 @@ class PiecewiseLR(_LRScheduler): ...@@ -319,7 +372,7 @@ class PiecewiseLR(_LRScheduler):
return self.values[len(self.values) - 1] return self.values[len(self.values) - 1]
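A standalone sketch of the boundary lookup (the loop body is elided in this diff, so this follows the documented semantics), using the boundaries and values from the example above:

boundaries = [3, 6, 9]
values = [0.1, 0.2, 0.3, 0.4]        # len(values) == len(boundaries) + 1
def piecewise_lr(epoch):
    for i, b in enumerate(boundaries):
        if epoch < b:                # first boundary the epoch has not yet reached
            return values[i]
    return values[-1]                # past the last boundary
print([piecewise_lr(e) for e in (0, 3, 6, 9)])   # -> [0.1, 0.2, 0.3, 0.4]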
class NaturalExpLR(_LRScheduler): class NaturalExpDecay(LRScheduler):
""" """
Applies natural exponential decay to the initial learning rate. Applies natural exponential decay to the initial learning rate.
...@@ -328,7 +381,7 @@ class NaturalExpLR(_LRScheduler): ...@@ -328,7 +381,7 @@ class NaturalExpLR(_LRScheduler):
.. math:: .. math::
new\_learning\_rate = learning\_rate * e^{- gama * epoch} new\_learning\_rate = learning\_rate * e^{- gamma * epoch}
Args: Args:
learning_rate (float): The initial learning rate. It is a python float number. learning_rate (float): The initial learning rate. It is a python float number.
...@@ -337,7 +390,7 @@ class NaturalExpLR(_LRScheduler): ...@@ -337,7 +390,7 @@ class NaturalExpLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``NaturalExpLR`` instance to schedule learning rate. ``NaturalExpDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -346,23 +399,21 @@ class NaturalExpLR(_LRScheduler): ...@@ -346,23 +399,21 @@ class NaturalExpLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -371,7 +422,7 @@ class NaturalExpLR(_LRScheduler): ...@@ -371,7 +422,7 @@ class NaturalExpLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -391,13 +442,14 @@ class NaturalExpLR(_LRScheduler): ...@@ -391,13 +442,14 @@ class NaturalExpLR(_LRScheduler):
def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
self.gamma = gamma self.gamma = gamma
super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose) super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self): def get_lr(self):
return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch) return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)
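A quick numeric check of the formula in ``get_lr`` above, with the learning_rate=0.5 and gamma=0.1 used in the example:

import math

base_lr, gamma = 0.5, 0.1
# lr(epoch) = base_lr * e^(-gamma * epoch): a smooth exponential decay in the epoch index
print([round(base_lr * math.exp(-gamma * e), 4) for e in (0, 1, 10)])   # -> [0.5, 0.4524, 0.1839]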
class InverseTimeLR(_LRScheduler): class InverseTimeDecay(LRScheduler):
""" """
Applies inverse time decay to the initial learning rate. Applies inverse time decay to the initial learning rate.
...@@ -416,7 +468,7 @@ class InverseTimeLR(_LRScheduler): ...@@ -416,7 +468,7 @@ class InverseTimeLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``InverseTimeLR`` instance to schedule learning rate. ``InverseTimeDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -425,23 +477,21 @@ class InverseTimeLR(_LRScheduler): ...@@ -425,23 +477,21 @@ class InverseTimeLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -450,7 +500,7 @@ class InverseTimeLR(_LRScheduler): ...@@ -450,7 +500,7 @@ class InverseTimeLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -471,13 +521,14 @@ class InverseTimeLR(_LRScheduler): ...@@ -471,13 +521,14 @@ class InverseTimeLR(_LRScheduler):
def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
self.gamma = gamma self.gamma = gamma
super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose) super(InverseTimeDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self): def get_lr(self):
return self.base_lr / (1 + self.gamma * self.last_epoch) return self.base_lr / (1 + self.gamma * self.last_epoch)
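And the corresponding check for the inverse-time formula in ``get_lr`` above (learning_rate=0.5, gamma=0.1, as in the example):

base_lr, gamma = 0.5, 0.1
# lr(epoch) = base_lr / (1 + gamma * epoch): a hyperbolic decay, slower than the exponential ones
print([round(base_lr / (1 + gamma * e), 4) for e in (0, 1, 10)])   # -> [0.5, 0.4545, 0.25]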
class PolynomialLR(_LRScheduler): class PolynomialDecay(LRScheduler):
""" """
Applies polynomial decay to the initial learning rate. Applies polynomial decay to the initial learning rate.
...@@ -512,7 +563,7 @@ class PolynomialLR(_LRScheduler): ...@@ -512,7 +563,7 @@ class PolynomialLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``PolynomialLR`` instance to schedule learning rate. ``PolynomialDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -521,23 +572,21 @@ class PolynomialLR(_LRScheduler): ...@@ -521,23 +572,21 @@ class PolynomialLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -546,7 +595,7 @@ class PolynomialLR(_LRScheduler): ...@@ -546,7 +595,7 @@ class PolynomialLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -576,7 +625,8 @@ class PolynomialLR(_LRScheduler): ...@@ -576,7 +625,8 @@ class PolynomialLR(_LRScheduler):
self.end_lr = end_lr self.end_lr = end_lr
self.power = power self.power = power
self.cycle = cycle self.cycle = cycle
super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose) super(PolynomialDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self): def get_lr(self):
tmp_epoch_num = self.last_epoch tmp_epoch_num = self.last_epoch
...@@ -596,7 +646,7 @@ class PolynomialLR(_LRScheduler): ...@@ -596,7 +646,7 @@ class PolynomialLR(_LRScheduler):
)**self.power) + self.end_lr )**self.power) + self.end_lr
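A sketch of the documented polynomial decay for the non-cycling case (parts of ``get_lr`` are elided in this diff), with illustrative values learning_rate=0.5, decay_steps=20, end_lr=0, power=1.0:

base_lr, end_lr, decay_steps, power = 0.5, 0.0, 20, 1.0
def poly_lr(epoch):                                        # cycle=False behaviour
    epoch = min(epoch, decay_steps)                        # clamp once decay has finished
    return (base_lr - end_lr) * (1 - epoch / decay_steps) ** power + end_lr
print([round(poly_lr(e), 3) for e in (0, 10, 20, 30)])     # -> [0.5, 0.25, 0.0, 0.0]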
class LinearLrWarmup(_LRScheduler): class LinearWarmup(LRScheduler):
""" """
Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
...@@ -604,22 +654,22 @@ class LinearLrWarmup(_LRScheduler): ...@@ -604,22 +654,22 @@ class LinearLrWarmup(_LRScheduler):
When epoch < warmup_steps, learning rate is updated as: When epoch < warmup_steps, learning rate is updated as:
.. code-block:: text .. math::
lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps) lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps}
where start_lr is the initial learning rate, and end_lr is the final learning rate; where start_lr is the initial learning rate, and end_lr is the final learning rate;
When epoch >= warmup_steps, learning rate is updated as: When epoch >= warmup_steps, learning rate is updated as:
.. code-block:: text .. math::
lr = learning_rate lr = learning_rate
where lr is float or any subclass of ``_LRScheduler`` . where ``learning_rate`` is float or any subclass of ``LRScheduler`` .
Args: Args:
learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` . learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
warmup_steps (int): total steps of warm up. warmup_steps (int): total steps of warm up.
start_lr (float): Initial learning rate of warm up. start_lr (float): Initial learning rate of warm up.
end_lr (float): Final learning rate of warm up. end_lr (float): Final learning rate of warm up.
...@@ -627,7 +677,7 @@ class LinearLrWarmup(_LRScheduler): ...@@ -627,7 +677,7 @@ class LinearLrWarmup(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``LinearLrWarmup`` instance to schedule learning rate. ``LinearWarmup`` instance to schedule learning rate.
Examples: Examples:
...@@ -636,24 +686,22 @@ class LinearLrWarmup(_LRScheduler): ...@@ -636,24 +686,22 @@ class LinearLrWarmup(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.LinearLrWarmup( scheduler = paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -662,7 +710,7 @@ class LinearLrWarmup(_LRScheduler): ...@@ -662,7 +710,7 @@ class LinearLrWarmup(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup( scheduler = paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -678,7 +726,7 @@ class LinearLrWarmup(_LRScheduler): ...@@ -678,7 +726,7 @@ class LinearLrWarmup(_LRScheduler):
'y': np.random.randn(3, 4, 5).astype('float32') 'y': np.random.randn(3, 4, 5).astype('float32')
}, },
fetch_list=loss.name) fetch_list=loss.name)
scheduler.step() scheduler.step()
""" """
def __init__(self, def __init__(self,
...@@ -689,10 +737,10 @@ class LinearLrWarmup(_LRScheduler): ...@@ -689,10 +737,10 @@ class LinearLrWarmup(_LRScheduler):
last_epoch=-1, last_epoch=-1,
verbose=False): verbose=False):
type_check = isinstance(learning_rate, float) or isinstance( type_check = isinstance(learning_rate, float) or isinstance(
learning_rate, int) or isinstance(learning_rate, _LRScheduler) learning_rate, int) or isinstance(learning_rate, LRScheduler)
if not type_check: if not type_check:
raise TypeError( raise TypeError(
"the type of learning_rate should be [int, float or _LRScheduler], the current type is {}". "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
format(learning_rate)) format(learning_rate))
self.learning_rate = learning_rate self.learning_rate = learning_rate
self.warmup_steps = warmup_steps self.warmup_steps = warmup_steps
...@@ -700,24 +748,24 @@ class LinearLrWarmup(_LRScheduler): ...@@ -700,24 +748,24 @@ class LinearLrWarmup(_LRScheduler):
self.end_lr = end_lr self.end_lr = end_lr
assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format( assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
end_lr, start_lr) end_lr, start_lr)
super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose) super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)
def get_lr(self): def get_lr(self):
if self.last_epoch < self.warmup_steps: if self.last_epoch < self.warmup_steps:
return (self.end_lr - self.start_lr) * float( return (self.end_lr - self.start_lr) * float(
self.last_epoch) / float(self.warmup_steps) + self.start_lr self.last_epoch) / float(self.warmup_steps) + self.start_lr
else: else:
if isinstance(self.learning_rate, _LRScheduler): if isinstance(self.learning_rate, LRScheduler):
self.learning_rate.step() self.learning_rate.step()
return self.learning_rate() return self.learning_rate()
return self.learning_rate return self.learning_rate
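To make the two phases concrete, a small sketch mirroring ``get_lr`` above for the float case, with warmup_steps=20, start_lr=0, end_lr=0.5 from the example:

warmup_steps, start_lr, end_lr, after_lr = 20, 0.0, 0.5, 0.5
def warmup_lr(epoch):
    if epoch < warmup_steps:               # linear ramp from start_lr towards end_lr
        return (end_lr - start_lr) * epoch / warmup_steps + start_lr
    return after_lr                        # afterwards the wrapped learning rate takes over
print([round(warmup_lr(e), 3) for e in (0, 10, 20, 40)])   # -> [0.0, 0.25, 0.5, 0.5]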
class ExponentialLR(_LRScheduler): class ExponentialDecay(LRScheduler):
""" """
Update learning rate by 'gamma' each epoch. Update learning rate by `gamma` each epoch.
The algorithm can be described as following. The algorithm can be described as following.
...@@ -733,7 +781,7 @@ class ExponentialLR(_LRScheduler): ...@@ -733,7 +781,7 @@ class ExponentialLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``ExponentialLR`` instance to schedule learning rate. ``ExponentialDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -742,23 +790,21 @@ class ExponentialLR(_LRScheduler): ...@@ -742,23 +790,21 @@ class ExponentialLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -767,7 +813,7 @@ class ExponentialLR(_LRScheduler): ...@@ -767,7 +813,7 @@ class ExponentialLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -787,15 +833,16 @@ class ExponentialLR(_LRScheduler): ...@@ -787,15 +833,16 @@ class ExponentialLR(_LRScheduler):
def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
self.gamma = gamma self.gamma = gamma
super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose) super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
verbose)
def get_lr(self): def get_lr(self):
return self.base_lr * (self.gamma**self.last_epoch) return self.base_lr * (self.gamma**self.last_epoch)
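For comparison with ``NaturalExpDecay`` above, this schedule multiplies the rate by ``gamma`` once per epoch; a quick check with the learning_rate=0.5 and gamma=0.9 from the example:

base_lr, gamma = 0.5, 0.9
# lr(epoch) = base_lr * gamma ** epoch
print([round(base_lr * gamma ** e, 4) for e in (0, 1, 10)])   # -> [0.5, 0.45, 0.1743]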
class MultiStepLR(_LRScheduler): class MultiStepDecay(LRScheduler):
""" """
Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones. Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.
The algorithm can be described as the code below. The algorithm can be described as the code below.
...@@ -821,7 +868,7 @@ class MultiStepLR(_LRScheduler): ...@@ -821,7 +868,7 @@ class MultiStepLR(_LRScheduler):
Returns: Returns:
``MultiStepLR`` instance to schedule learning rate. ``MultiStepDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -830,23 +877,21 @@ class MultiStepLR(_LRScheduler): ...@@ -830,23 +877,21 @@ class MultiStepLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -855,7 +900,7 @@ class MultiStepLR(_LRScheduler): ...@@ -855,7 +900,7 @@ class MultiStepLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -894,7 +939,7 @@ class MultiStepLR(_LRScheduler): ...@@ -894,7 +939,7 @@ class MultiStepLR(_LRScheduler):
self.milestones = milestones self.milestones = milestones
self.gamma = gamma self.gamma = gamma
super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose) super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self): def get_lr(self):
for i in range(len(self.milestones)): for i in range(len(self.milestones)):
...@@ -903,7 +948,7 @@ class MultiStepLR(_LRScheduler): ...@@ -903,7 +948,7 @@ class MultiStepLR(_LRScheduler):
return self.base_lr * (self.gamma**len(self.milestones)) return self.base_lr * (self.gamma**len(self.milestones))
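A sketch of the milestone counting (part of the loop is elided in this diff, so this follows the documented semantics), with learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8 from the example:

base_lr, milestones, gamma = 0.5, [2, 4, 6], 0.8
def multistep_lr(epoch):
    # the rate is multiplied by gamma once for every milestone the epoch has reached
    passed = sum(1 for m in milestones if epoch >= m)
    return base_lr * gamma ** passed
print([round(multistep_lr(e), 3) for e in (0, 2, 4, 6)])   # -> [0.5, 0.4, 0.32, 0.256]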
class StepLR(_LRScheduler): class StepDecay(LRScheduler):
""" """
Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch. Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch.
...@@ -929,7 +974,7 @@ class StepLR(_LRScheduler): ...@@ -929,7 +974,7 @@ class StepLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``StepLR`` instance to schedule learning rate. ``StepDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -939,23 +984,21 @@ class StepLR(_LRScheduler): ...@@ -939,23 +984,21 @@ class StepLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -964,7 +1007,7 @@ class StepLR(_LRScheduler): ...@@ -964,7 +1007,7 @@ class StepLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -997,14 +1040,14 @@ class StepLR(_LRScheduler): ...@@ -997,14 +1040,14 @@ class StepLR(_LRScheduler):
self.step_size = step_size self.step_size = step_size
self.gamma = gamma self.gamma = gamma
super(StepLR, self).__init__(learning_rate, last_epoch, verbose) super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self): def get_lr(self):
i = self.last_epoch // self.step_size i = self.last_epoch // self.step_size
return self.base_lr * (self.gamma**i) return self.base_lr * (self.gamma**i)
class LambdaLR(_LRScheduler): class LambdaDecay(LRScheduler):
""" """
Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` . Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` .
...@@ -1015,9 +1058,9 @@ class LambdaLR(_LRScheduler): ...@@ -1015,9 +1058,9 @@ class LambdaLR(_LRScheduler):
learning_rate = 0.5 # init learning_rate learning_rate = 0.5 # init learning_rate
lr_lambda = lambda epoch: 0.95 ** epoch lr_lambda = lambda epoch: 0.95 ** epoch
learning_rate = 0.5 # epoch 0 learning_rate = 0.5 # epoch 0, 0.5*0.95**0
learning_rate = 0.475 # epoch 1 learning_rate = 0.475 # epoch 1, 0.5*0.95**1
learning_rate = 0.45125 # epoch 2 learning_rate = 0.45125 # epoch 2, 0.5*0.95**2
Args: Args:
learning_rate (float): The initial learning rate. It is a python float number. learning_rate (float): The initial learning rate. It is a python float number.
...@@ -1026,7 +1069,7 @@ class LambdaLR(_LRScheduler): ...@@ -1026,7 +1069,7 @@ class LambdaLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``LambdaLR`` instance to schedule learning rate. ``LambdaDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -1035,23 +1078,21 @@ class LambdaLR(_LRScheduler): ...@@ -1035,23 +1078,21 @@ class LambdaLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -1060,7 +1101,7 @@ class LambdaLR(_LRScheduler): ...@@ -1060,7 +1101,7 @@ class LambdaLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -1082,17 +1123,17 @@ class LambdaLR(_LRScheduler): ...@@ -1082,17 +1123,17 @@ class LambdaLR(_LRScheduler):
def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
if not callable(lr_lambda): if not callable(lr_lambda):
raise TypeError( raise TypeError(
"The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s." "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
% type(lr_lambda)) % type(lr_lambda))
self.lr_lambda = lr_lambda self.lr_lambda = lr_lambda
super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose) super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose)
def get_lr(self): def get_lr(self):
return self.base_lr * self.lr_lambda(self.last_epoch) return self.base_lr * self.lr_lambda(self.last_epoch)
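The lambda simply rescales the initial rate, reproducing the table above (learning_rate=0.5, lr_lambda=lambda x: 0.95**x):

base_lr = 0.5
lr_lambda = lambda epoch: 0.95 ** epoch        # any callable of the epoch index works
print([round(base_lr * lr_lambda(e), 5) for e in (0, 1, 2)])   # -> [0.5, 0.475, 0.45125]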
class ReduceLROnPlateau(_LRScheduler): class ReduceOnPlateau(LRScheduler):
""" """
Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
by 2 to 10 times once model performance no longer improves. by 2 to 10 times once model performance no longer improves.
...@@ -1126,7 +1167,7 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1126,7 +1167,7 @@ class ReduceLROnPlateau(_LRScheduler):
Returns: Returns:
``ReduceLROnPlateau`` instance to schedule learning rate. ``ReduceOnPlateau`` instance to schedule learning rate.
Examples: Examples:
...@@ -1135,23 +1176,21 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1135,23 +1176,21 @@ class ReduceLROnPlateau(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step(loss) scheduler.step(loss)
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -1160,7 +1199,7 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1160,7 +1199,7 @@ class ReduceLROnPlateau(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -1207,7 +1246,7 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1207,7 +1246,7 @@ class ReduceLROnPlateau(_LRScheduler):
self.threshold_mode = threshold_mode self.threshold_mode = threshold_mode
if not isinstance(learning_rate, (float, int)): if not isinstance(learning_rate, (float, int)):
raise TypeError( raise TypeError(
"The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s." "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
% type(learning_rate)) % type(learning_rate))
self.verbose = verbose self.verbose = verbose
...@@ -1230,7 +1269,7 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1230,7 +1269,7 @@ class ReduceLROnPlateau(_LRScheduler):
self._var_name = None self._var_name = None
# "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored. # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
def _state_keys(self): def state_keys(self):
self.keys = [ self.keys = [
'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch', 'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
'last_lr' 'last_lr'
...@@ -1238,7 +1277,7 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1238,7 +1277,7 @@ class ReduceLROnPlateau(_LRScheduler):
def step(self, metrics, epoch=None): def step(self, metrics, epoch=None):
""" """
step should be called after 'minimize' . It will update the learning rate in optimizer according to ``metrics`` . step should be called after `optimizer.step()` . It will update the learning rate in the optimizer according to ``metrics`` .
The new learning rate will take effect on next epoch. The new learning rate will take effect on next epoch.
Args: Args:
...@@ -1251,14 +1290,14 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1251,14 +1290,14 @@ class ReduceLROnPlateau(_LRScheduler):
None None
Examples: Examples:
Please refer to the example of current _LRScheduler. Please refer to the example of the current ``LRScheduler`` .
""" """
if epoch is None: if epoch is None:
self.last_epoch = self.last_epoch + 1 self.last_epoch = self.last_epoch + 1
else: else:
self.last_epoch = epoch self.last_epoch = epoch
# loss must be 1-D Tensor with shape [1] # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
if isinstance(metrics, (Tensor, numpy.ndarray)): if isinstance(metrics, (Tensor, numpy.ndarray)):
assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
"should be (1L,), but the current metrics.shape is {}. Maybe that " \ "should be (1L,), but the current metrics.shape is {}. Maybe that " \
...@@ -1290,7 +1329,6 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1290,7 +1329,6 @@ class ReduceLROnPlateau(_LRScheduler):
self.last_lr)) self.last_lr))
def _is_better(self, current, best): def _is_better(self, current, best):
print("mode", self.mode, 'threshold_mode', self.threshold_mode)
if self.mode == 'min' and self.threshold_mode == 'rel': if self.mode == 'min' and self.threshold_mode == 'rel':
return current < best - best * self.threshold return current < best - best * self.threshold
...@@ -1304,31 +1342,23 @@ class ReduceLROnPlateau(_LRScheduler): ...@@ -1304,31 +1342,23 @@ class ReduceLROnPlateau(_LRScheduler):
return current > best + self.threshold return current > best + self.threshold
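As a small illustration of the trigger logic, a sketch in which the metric plateaus and the rate is eventually multiplied by ``factor`` (hyper-parameters are illustrative; patience is lowered to 2 so the reduction shows up quickly):

import paddle

sched = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=2, verbose=True)
# the metric improves for three epochs, then stops improving; once the bad-epoch
# count exceeds patience, the learning rate is reduced from 1.0 to 0.5
for loss in [1.0, 0.9, 0.8, 0.8, 0.8, 0.8]:
    sched.step(loss)    # metrics may be a python float, a numpy.ndarray or a 1-D Tensor of shape [1]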
class CosineAnnealingLR(_LRScheduler): class CosineAnnealingDecay(LRScheduler):
""" """
Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
SGDR: SGDR.
\begin{aligned}
\eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+ \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
& T_{cur} \neq (2k+1)T_{max}; \\
\eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
\left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
& T_{cur} = (2k+1)T_{max}.
\end{aligned}
The algorithm can be described as following. The algorithm can be described as following.
.. math:: .. math::
\begin{aligned}
\eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 \\begin{aligned}
+ \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), \eta_t & = \eta_{min} + \\frac{1}{2}(\eta_{max} - \eta_{min})\left(1
& T_{cur} \neq (2k+1)T_{max}; \\ + \cos\left(\\frac{T_{cur}}{T_{max}}\pi\\right)\\right),
\eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) & T_{cur} \\neq (2k+1)T_{max}; \\
\left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), \eta_{t+1} & = \eta_{t} + \\frac{1}{2}(\eta_{max} - \eta_{min})
\left(1 - \cos\left(\\frac{1}{T_{max}}\pi\\right)\\right),
& T_{cur} = (2k+1)T_{max}. & T_{cur} = (2k+1)T_{max}.
\end{aligned} \end{aligned}
...@@ -1343,7 +1373,7 @@ class CosineAnnealingLR(_LRScheduler): ...@@ -1343,7 +1373,7 @@ class CosineAnnealingLR(_LRScheduler):
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns: Returns:
``CosineAnnealingLR`` instance to schedule learning rate. ``CosineAnnealingDecay`` instance to schedule learning rate.
Examples: Examples:
...@@ -1352,23 +1382,21 @@ class CosineAnnealingLR(_LRScheduler): ...@@ -1352,23 +1382,21 @@ class CosineAnnealingLR(_LRScheduler):
import paddle import paddle
import numpy as np import numpy as np
# train on default dygraph mode # train on default dynamic graph mode
paddle.disable_static()
x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
for epoch in range(20): for epoch in range(20):
for batch_id in range(2): for batch_id in range(2):
x = paddle.to_tensor(x) x = paddle.uniform([10, 10])
out = linear(x) out = linear(x)
loss = paddle.reduce_mean(out) loss = paddle.reduce_mean(out)
loss.backward() loss.backward()
sgd.minimize(loss) sgd.step()
linear.clear_gradients() sgd.clear_gradients()
scheduler.step() scheduler.step()
# train on static mode # train on static graph mode
paddle.enable_static() paddle.enable_static()
main_prog = paddle.static.Program() main_prog = paddle.static.Program()
start_prog = paddle.static.Program() start_prog = paddle.static.Program()
...@@ -1377,7 +1405,7 @@ class CosineAnnealingLR(_LRScheduler): ...@@ -1377,7 +1405,7 @@ class CosineAnnealingLR(_LRScheduler):
y = paddle.static.data(name='y', shape=[None, 4, 5]) y = paddle.static.data(name='y', shape=[None, 4, 5])
z = paddle.static.nn.fc(x, 100) z = paddle.static.nn.fc(x, 100)
loss = paddle.mean(z) loss = paddle.mean(z)
scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss) sgd.minimize(loss)
...@@ -1403,16 +1431,16 @@ class CosineAnnealingLR(_LRScheduler): ...@@ -1403,16 +1431,16 @@ class CosineAnnealingLR(_LRScheduler):
verbose=False): verbose=False):
if not isinstance(T_max, int): if not isinstance(T_max, int):
raise TypeError( raise TypeError(
"The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s." "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
% type(T_max)) % type(T_max))
if not isinstance(eta_min, (float, int)): if not isinstance(eta_min, (float, int)):
raise TypeError( raise TypeError(
"The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s." "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
% type(eta_min)) % type(eta_min))
self.T_max = T_max self.T_max = T_max
self.eta_min = float(eta_min) self.eta_min = float(eta_min)
super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch, super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch,
verbose) verbose)
def get_lr(self): def get_lr(self):
if self.last_epoch == 0: if self.last_epoch == 0:
......
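Although ``get_lr`` is computed recursively (and is largely elided in this diff), within one period the schedule traces the closed-form half-cosine; a sketch with learning_rate=0.5 and T_max=10 from the example, and eta_min=0:

import math

eta_max, eta_min, T_max = 0.5, 0.0, 10
def cosine_lr(epoch):
    # starts at eta_max, follows half a cosine, and reaches eta_min at epoch == T_max
    return eta_min + 0.5 * (eta_max - eta_min) * (1 + math.cos(math.pi * epoch / T_max))
print([round(cosine_lr(e), 4) for e in (0, 5, 10)])   # -> [0.5, 0.25, 0.0]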
...@@ -41,7 +41,7 @@ from paddle.fluid.layers import tensor ...@@ -41,7 +41,7 @@ from paddle.fluid.layers import tensor
from functools import reduce from functools import reduce
from ..fluid.wrapped_decorator import signature_safe_contextmanager from ..fluid.wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt from .. import compat as cpt
from .lr_scheduler import _LRScheduler from .lr import LRScheduler
__all__ = ['Optimizer'] __all__ = ['Optimizer']
...@@ -54,8 +54,8 @@ class Optimizer(object): ...@@ -54,8 +54,8 @@ class Optimizer(object):
but need to use one of its implementations. but need to use one of its implementations.
Args: Args:
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``_LRScheduler`` . It can be a float value or any subclass of ``LRScheduler`` .
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \ This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated. The default value is None in static mode, at this time all parameters will be updated.
...@@ -82,12 +82,8 @@ class Optimizer(object): ...@@ -82,12 +82,8 @@ class Optimizer(object):
#Take the subclass adam as an example #Take the subclass adam as an example
import paddle import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp) inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear(inp) out = linear(inp)
loss = paddle.mean(out) loss = paddle.mean(out)
adam = paddle.optimizer.Adam(learning_rate=0.1, adam = paddle.optimizer.Adam(learning_rate=0.1,
...@@ -121,9 +117,9 @@ class Optimizer(object): ...@@ -121,9 +117,9 @@ class Optimizer(object):
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__()) % weight_decay.__str__())
break break
if not isinstance(learning_rate, (float, _LRScheduler)): if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError( raise TypeError(
"learning rate should be float or _LRScheduler, got %s here" % "learning rate should be float or LRScheduler, got %s here" %
type(learning_rate)) type(learning_rate))
if grad_clip is not None: if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase): if not isinstance(grad_clip, GradientClipBase):
...@@ -156,7 +152,7 @@ class Optimizer(object): ...@@ -156,7 +152,7 @@ class Optimizer(object):
@framework.dygraph_only @framework.dygraph_only
def state_dict(self): def state_dict(self):
''' '''
Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be include in state dict. Get state dict information from optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If LRScheduler has been used, global_step will be included in the state dict.
If the optimizer is never called (i.e., the minimize function is never invoked), the state_dict is empty. If the optimizer is never called (i.e., the minimize function is never invoked), the state_dict is empty.
Args: Args:
...@@ -169,7 +165,6 @@ class Optimizer(object): ...@@ -169,7 +165,6 @@ class Optimizer(object):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10) emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
...@@ -181,14 +176,14 @@ class Optimizer(object): ...@@ -181,14 +176,14 @@ class Optimizer(object):
for para_name, var_tmp in v.items(): for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp state_dict[var_tmp.name] = var_tmp
# global step if use lr decay # global step if use lr decay
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict() state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
return state_dict return state_dict
@framework.dygraph_only @framework.dygraph_only
def set_state_dict(self, state_dict): def set_state_dict(self, state_dict):
''' '''
Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an _LRScheduler has been used, global_step will be changed. Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be changed.
Args: Args:
state_dict(dict) : Dict containing all the Tensors needed by the optimizer state_dict(dict) : Dict containing all the Tensors needed by the optimizer
...@@ -199,26 +194,28 @@ class Optimizer(object): ...@@ -199,26 +194,28 @@ class Optimizer(object):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict() emb = paddle.nn.Embedding(10, 10)
paddle.framework.save(state_dict, "paddle_dy")
adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000), layer_state_dict = emb.state_dict()
parameters=emb.parameters()) paddle.save(layer_state_dict, "emb.pdparams")
state_dict = adam.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
parameters=emb.parameters())
opt_state_dict = adam.state_dict()
paddle.save(opt_state_dict, "adam.pdopt")
opti_state_dict = paddle.load("adam.pdopt")
adam.set_state_dict(opti_state_dict) adam.set_state_dict(opti_state_dict)
''' '''
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_dict(state_dict["LR_Scheduler"]) self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
self._accumulators_holder = state_dict self._accumulators_holder = state_dict
...@@ -256,7 +253,7 @@ class Optimizer(object): ...@@ -256,7 +253,7 @@ class Optimizer(object):
return self._opti_name_list return self._opti_name_list
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate() lr_var = self._global_learning_rate()
# only create global lr_var once # only create global lr_var once
if not isinstance(lr_var, framework.Variable): if not isinstance(lr_var, framework.Variable):
...@@ -299,7 +296,7 @@ class Optimizer(object): ...@@ -299,7 +296,7 @@ class Optimizer(object):
""" """
:api_attr: imperative :api_attr: imperative
Set the value of the learning rate manually in the optimizer. If the optimizer uses _LRScheduler, Set the value of the learning rate manually in the optimizer. If the optimizer uses LRScheduler,
this API cannot be invoked, because it will lead to a conflict. this API cannot be invoked, because it will lead to a conflict.
Args: Args:
...@@ -312,7 +309,6 @@ class Optimizer(object): ...@@ -312,7 +309,6 @@ class Optimizer(object):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.disable_static()
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
...@@ -335,9 +331,9 @@ class Optimizer(object): ...@@ -335,9 +331,9 @@ class Optimizer(object):
raise TypeError( raise TypeError(
"The type of 'value' in optimizer.set_lr must be float, but received %s." "The type of 'value' in optimizer.set_lr must be float, but received %s."
% (type(value))) % (type(value)))
if isinstance(self._learning_rate, _LRScheduler): if isinstance(self._learning_rate, LRScheduler):
raise RuntimeError( raise RuntimeError(
"optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict." "optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict."
) )
self._learning_rate = float(value) self._learning_rate = float(value)
current_lr = self._global_learning_rate() current_lr = self._global_learning_rate()
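For reference, a minimal sketch of the behaviour documented in this hunk, assuming a PaddlePaddle 2.0 dygraph environment: set_lr updates a float learning rate in place, and raises a RuntimeError when the optimizer was built with an LRScheduler.

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)

    # A plain float learning rate can be overridden manually.
    adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
    adam.set_lr(0.05)
    print(adam.get_lr())  # 0.05

    # With an LRScheduler as learning_rate, set_lr refuses to run.
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100)
    adam2 = paddle.optimizer.Adam(scheduler, parameters=linear.parameters())
    try:
        adam2.set_lr(0.05)
    except RuntimeError as e:
        print(e)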
...@@ -358,7 +354,7 @@ class Optimizer(object): ...@@ -358,7 +354,7 @@ class Optimizer(object):
""" """
:api_attr: imperative :api_attr: imperative
Get the current step learning rate. The return value is always the same when _LRScheduler is not used; Get the current step learning rate. The return value is always the same when LRScheduler is not used;
otherwise it returns the current step learning rate. otherwise it returns the current step learning rate.
...@@ -370,15 +366,13 @@ class Optimizer(object): ...@@ -370,15 +366,13 @@ class Optimizer(object):
import numpy as np import numpy as np
import paddle import paddle
# example1: _LRScheduler is not used, return value is all the same # example1: LRScheduler is not used, return value is all the same
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10) emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
lr = adam.get_lr() lr = adam.get_lr()
print(lr) # 0.001 print(lr) # 0.001
# example2: PiecewiseLR is used, return the step learning rate # example2: PiecewiseDecay is used, return the scheduled learning rate
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp) inp = paddle.to_tensor(inp)
...@@ -387,7 +381,7 @@ class Optimizer(object): ...@@ -387,7 +381,7 @@ class Optimizer(object):
bd = [2, 4, 6, 8] bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0] value = [0.2, 0.4, 0.6, 0.8, 1.0]
scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0) scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value, 0)
adam = paddle.optimizer.Adam(scheduler, adam = paddle.optimizer.Adam(scheduler,
parameters=linear.parameters()) parameters=linear.parameters())
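The truncated get_lr example above can be completed along the following lines; this is a minimal sketch (boundaries and values taken from the hunk itself, training loop assumed) showing how get_lr() follows the PiecewiseDecay schedule as scheduler.step() is called.

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2, 4, 6, 8],
                                                   values=[0.2, 0.4, 0.6, 0.8, 1.0])
    adam = paddle.optimizer.Adam(scheduler, parameters=linear.parameters())

    for epoch in range(5):
        out = linear(paddle.uniform(shape=[10, 10], min=-0.1, max=0.1))
        loss = paddle.mean(out)
        loss.backward()
        adam.step()
        adam.clear_grad()
        print(adam.get_lr())  # 0.2, 0.2, 0.4, 0.4, 0.6
        scheduler.step()      # advance the schedule once per epoch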
...@@ -656,7 +650,6 @@ class Optimizer(object): ...@@ -656,7 +650,6 @@ class Optimizer(object):
import paddle import paddle
import numpy as np import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
...@@ -727,7 +720,6 @@ class Optimizer(object): ...@@ -727,7 +720,6 @@ class Optimizer(object):
import paddle import paddle
import numpy as np import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp) inp = paddle.to_tensor(inp)
...@@ -805,7 +797,7 @@ class Optimizer(object): ...@@ -805,7 +797,7 @@ class Optimizer(object):
import numpy as np import numpy as np
import paddle import paddle
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
...@@ -854,13 +846,9 @@ class Optimizer(object): ...@@ -854,13 +846,9 @@ class Optimizer(object):
.. code-block:: python .. code-block:: python
import paddle import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp) input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear(inp) out = linear(input)
loss = paddle.mean(out) loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32") beta1 = paddle.to_tensor([0.9], dtype="float32")
...@@ -903,7 +891,7 @@ class Optimizer(object): ...@@ -903,7 +891,7 @@ class Optimizer(object):
import paddle import paddle
import numpy as np import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
......
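Pulling the optimizer.py changes together, the following is a minimal end-to-end sketch, assuming PaddlePaddle 2.0 with the renamed paddle.optimizer.lr module; the file name "adam.pdopt" is illustrative only.

.. code-block:: python

    import paddle

    # Optimizer driven by an LRScheduler subclass from the renamed module.
    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
    adam = paddle.optimizer.Adam(learning_rate=scheduler,
                                 parameters=linear.parameters())

    # One training step, then advance the schedule.
    inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
    loss = paddle.mean(linear(inp))
    loss.backward()
    adam.step()
    adam.clear_grad()
    scheduler.step()

    # The optimizer state (including the "LR_Scheduler" entry) round-trips
    # through paddle.save / paddle.load.
    paddle.save(adam.state_dict(), "adam.pdopt")
    adam.set_state_dict(paddle.load("adam.pdopt"))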
...@@ -69,8 +69,8 @@ class RMSProp(Optimizer): ...@@ -69,8 +69,8 @@ class RMSProp(Optimizer):
Parameters: Parameters:
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or an _LRScheduler. It can be a float value or an LRScheduler.
rho(float): rho is :math:`\\rho` in the equation, default is 0.95. rho(float): rho is :math:`\\rho` in the equation, default is 0.95.
epsilon(float): :math:`\\epsilon` in the equation is a smoothing term to epsilon(float): :math:`\\epsilon` in the equation is a smoothing term to
avoid division by zero, default is 1e-6. avoid division by zero, default is 1e-6.
......
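And for the rmsprop.py hunk, a minimal sketch of passing an LRScheduler instead of a float, again assuming the 2.0 paddle.optimizer.lr module; the boundaries and values are illustrative only.

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(13, 5)

    # learning_rate accepts either a float or any LRScheduler subclass.
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2, 4, 6, 8],
                                                   values=[0.2, 0.4, 0.6, 0.8, 1.0])
    rmsprop = paddle.optimizer.RMSProp(learning_rate=scheduler,
                                       rho=0.95,
                                       epsilon=1e-6,
                                       parameters=linear.parameters())

    out = linear(paddle.uniform(shape=[2, 13], min=-0.1, max=0.1))
    loss = paddle.mean(out)
    loss.backward()
    rmsprop.step()
    rmsprop.clear_grad()
    scheduler.step()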