Unverified  Commit e122e164  authored by Zhou Wei, committed by GitHub

fix english doc, unittest, and remove useless alias of 2.0 lr_scheduler (#27686)

* fix doc and unittest of 2.0 lr_scheduler

* fix doc of 2.0 lr_scheduler

* fix unittest

* fix english doc of lr_scheduler

* fix api name of lr scheduler

* fix api name of lr scheduler
Parent 9215ad96
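A minimal sketch of the rename this commit performs, pieced together from the hunks below (the Adam wiring and the Linear layer are illustrative assumptions, not part of the diff):

import paddle

# old spelling, removed by this commit:
#   scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100)
# new spelling:
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)

linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(learning_rate=scheduler,
                             parameters=linear.parameters())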
......@@ -237,13 +237,6 @@ from .framework import save #DEFINE_ALIAS
from .framework import load #DEFINE_ALIAS
from .framework import DataParallel #DEFINE_ALIAS
from .framework import NoamDecay #DEFINE_ALIAS
from .framework import PiecewiseDecay #DEFINE_ALIAS
from .framework import NaturalExpDecay #DEFINE_ALIAS
from .framework import ExponentialDecay #DEFINE_ALIAS
from .framework import InverseTimeDecay #DEFINE_ALIAS
from .framework import PolynomialDecay #DEFINE_ALIAS
from .framework import CosineDecay #DEFINE_ALIAS
from .framework import set_default_dtype #DEFINE_ALIAS
from .framework import get_default_dtype #DEFINE_ALIAS
......
......@@ -164,7 +164,7 @@ def load_dygraph(model_path, **configs):
state_dict = emb.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......
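The load_dygraph docstring above is shown only in part by the diff; a self-contained sketch of the updated example, with the Embedding construction assumed since the diff truncates it:

import paddle
import paddle.fluid as fluid

emb = paddle.nn.Embedding(10, 10)
fluid.save_dygraph(emb.state_dict(), "paddle_dy")

scheduler = paddle.optimizer.lr.NoamDecay(
    d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
    learning_rate=scheduler,
    parameters=emb.parameters())
fluid.save_dygraph(adam.state_dict(), "paddle_dy")

# restores both the parameter and the optimizer (incl. scheduler) dicts
para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")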
......@@ -855,7 +855,7 @@ class Executor(object):
def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
return_numpy, return_merged):
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
exe = program._executor
# TODO(zhenghuihuang): quantization uses Graph in CompiledProgram
# instead of program. We will add support for checking Vars in Graph
......@@ -901,7 +901,7 @@ class Executor(object):
if hasattr(program._program, 'lr_sheduler'):
lr_sheduler = program._program.lr_sheduler
assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler"
assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
lr_value = lr_sheduler()
lr_var = program._program.global_block().vars[lr_sheduler._var_name]
lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
......@@ -1238,7 +1238,7 @@ class Executor(object):
def _run_program(self, program, feed, fetch_list, feed_var_name,
fetch_var_name, scope, return_numpy, use_program_cache):
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
if feed is None:
feed = {}
elif isinstance(feed, (list, tuple)):
......@@ -1296,7 +1296,7 @@ class Executor(object):
self._feed_data(program, feed, feed_var_name, scope)
if hasattr(program, 'lr_sheduler'):
assert isinstance(program.lr_sheduler,
_LRScheduler), "must be _LRScheduler"
LRScheduler), "must be LRScheduler"
lr_sheduler = program.lr_sheduler
lr_value = lr_sheduler()
lr_var = program.global_block().vars[lr_sheduler._var_name]
......
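The executor reads the per-step value by calling the scheduler object (lr_value = lr_sheduler() above) and the schedule is advanced from user code via step(); a short sketch of that protocol, with boundaries and values chosen here only for illustration:

import paddle

scheduler = paddle.optimizer.lr.PiecewiseDecay(
    boundaries=[2, 4], values=[0.1, 0.2, 0.3])

print(scheduler())   # current learning rate, what the executor feeds into the program
scheduler.step()     # user code advances the schedule, typically once per step/epoch
scheduler.step()
print(scheduler())   # value after crossing the first boundary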
......@@ -70,15 +70,15 @@ class Optimizer(object):
grad_clip=None,
name=None):
# Because of the loop import, so place it in the function body
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
self._parameter_list = list(
parameter_list) if parameter_list is not None else None
self._name = name
if framework.in_dygraph_mode():
if not isinstance(learning_rate,
(float, LearningRateDecay, _LRScheduler)):
(float, LearningRateDecay, LRScheduler)):
raise TypeError(
"learning rate should be float or _LRScheduler, got %s here"
"learning rate should be float or LRScheduler, got %s here"
% type(learning_rate))
if self._parameter_list is None:
raise AttributeError(
......@@ -94,9 +94,9 @@ class Optimizer(object):
break
else:
if not isinstance(learning_rate,
(float, framework.Variable, _LRScheduler)):
(float, framework.Variable, LRScheduler)):
raise TypeError(
"learning rate should be float or _LRScheduler, got %s here"
"learning rate should be float or LRScheduler, got %s here"
% type(learning_rate))
if grad_clip is not None:
......@@ -147,13 +147,13 @@ class Optimizer(object):
state_dict = adam.state_dict()
'''
from paddle.optimizer.lr_scheduler import _LRScheduler
from paddle.optimizer.lr import LRScheduler
state_dict = {}
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp
# global step if use lr decay
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
return state_dict
if isinstance(self._learning_rate, LearningRateDecay):
......@@ -193,7 +193,7 @@ class Optimizer(object):
state_dict = emb.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......@@ -203,8 +203,8 @@ class Optimizer(object):
para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
'''
from paddle.optimizer.lr_scheduler import _LRScheduler
if isinstance(self._learning_rate, _LRScheduler):
from paddle.optimizer.lr import LRScheduler
if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, LearningRateDecay):
......@@ -269,8 +269,8 @@ class Optimizer(object):
return self._opti_name_list
def _create_global_learning_rate(self):
from paddle.optimizer.lr_scheduler import _LRScheduler
if isinstance(self._learning_rate, _LRScheduler):
from paddle.optimizer.lr import LRScheduler
if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate()
# only create global lr_var once
if not isinstance(lr_var, framework.Variable):
......
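When an LRScheduler drives the learning rate, both the fluid optimizer above and the 2.0 optimizer later in this diff nest the scheduler state under an "LR_Scheduler" key; a hedged sketch, with the scheduler choice and hyperparameters assumed:

import paddle

linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8)
adam = paddle.optimizer.Adam(learning_rate=scheduler,
                             parameters=linear.parameters())

state = adam.state_dict()
print("LR_Scheduler" in state)   # True, per the branch shown above
adam.set_state_dict(state)       # restores the scheduler along with the moments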
......@@ -455,8 +455,8 @@ class TestAdamOpV2(unittest.TestCase):
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
#learning_rate is _LRScheduler
learning_rate = paddle.optimizer.CosineAnnealingLR(
#learning_rate is LRScheduler
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.1, T_max=10)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate,
......
......@@ -43,14 +43,22 @@ class TestDirectory(unittest.TestCase):
'paddle.distributed.prepare_context', 'paddle.DataParallel',
'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static',
'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay',
'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay',
'paddle.ExponentialDecay', 'paddle.InverseTimeDecay',
'paddle.PolynomialDecay', 'paddle.CosineDecay',
'paddle.static.Executor', 'paddle.static.global_scope',
'paddle.static.scope_guard', 'paddle.static.append_backward',
'paddle.static.gradients', 'paddle.static.BuildStrategy',
'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy',
'paddle.jit.save', 'paddle.jit.load',
'paddle.optimizer.lr.LRScheduler', 'paddle.optimizer.lr.NoamDecay',
'paddle.optimizer.lr.PiecewiseDecay',
'paddle.optimizer.lr.NaturalExpDecay',
'paddle.optimizer.lr.ExponentialDecay',
'paddle.optimizer.lr.InverseTimeDecay',
'paddle.optimizer.lr.PolynomialDecay',
'paddle.optimizer.lr.CosineAnnealingDecay',
'paddle.optimizer.lr.MultiStepDecay',
'paddle.optimizer.lr.StepDecay', 'paddle.optimizer.lr.LambdaDecay',
'paddle.optimizer.lr.ReduceOnPlateau',
'paddle.optimizer.lr.LinearWarmup', 'paddle.static.Executor',
'paddle.static.global_scope', 'paddle.static.scope_guard',
'paddle.static.append_backward', 'paddle.static.gradients',
'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram',
'paddle.static.ExecutionStrategy',
'paddle.static.default_main_program',
'paddle.static.default_startup_program', 'paddle.static.Program',
'paddle.static.name_scope', 'paddle.static.program_guard',
......
......@@ -23,7 +23,7 @@ import itertools
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
from paddle.fluid.optimizer import MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable
......@@ -72,15 +72,17 @@ class TestImperativeOptimizerBase(unittest.TestCase):
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
with fluid.dygraph.guard(place):
try:
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
except Exception as e:
assert str(e) == exception_message
try:
paddle.disable_static()
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
except Exception as e:
assert str(e) == exception_message
finally:
paddle.enable_static()
def _check_mlp(self, place=None):
seed = 90
......@@ -90,47 +92,55 @@ class TestImperativeOptimizerBase(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
paddle.disable_static(place)
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
mlp = MLP()
optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
paddle.batch(
self.reader_decorator(paddle.dataset.mnist.train()),
batch_size=batch_size,
drop_last=True),
places=fluid.CPUPlace())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
paddle.batch(
self.reader_decorator(paddle.dataset.mnist.train()),
batch_size=batch_size,
drop_last=True),
places=fluid.CPUPlace())
dy_param_init_value = {}
for batch_id, data in enumerate(batch_py_reader()):
if batch_id >= self.batch_num:
break
dy_param_init_value = {}
for batch_id, data in enumerate(batch_py_reader()):
if batch_id >= self.batch_num:
break
img = data[0]
label = data[1]
label.stop_gradient = True
img = data[0]
label = data[1]
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
label.stop_gradient = True
if batch_id == 0:
for param in mlp.parameters():
dy_param_init_value[param.name] = param.numpy()
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
avg_loss.backward()
optimizer.minimize(avg_loss)
mlp.clear_gradients()
dy_param_value = {}
if batch_id == 0:
for param in mlp.parameters():
dy_param_value[param.name] = param.numpy()
dy_param_init_value[param.name] = param.numpy()
avg_loss.backward()
optimizer.minimize(avg_loss)
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.ReduceOnPlateau):
optimizer._learning_rate.step(avg_loss)
else:
optimizer._learning_rate.step()
mlp.clear_gradients()
dy_param_value = {}
for param in mlp.parameters():
dy_param_value[param.name] = param.numpy()
paddle.enable_static()
with new_program_scope():
paddle.manual_seed(seed)
paddle.framework.random._manual_program_seed(seed)
......@@ -181,6 +191,13 @@ class TestImperativeOptimizerBase(unittest.TestCase):
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.ReduceOnPlateau):
optimizer._learning_rate.step(out[0])
else:
optimizer._learning_rate.step()
static_param_value = {}
static_out = out[0]
......@@ -199,17 +216,19 @@ class TestImperativeOptimizerBase(unittest.TestCase):
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9]
optimizer = SGDOptimizer(
learning_rate=paddle.optimizer.PiecewiseLR(
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list)
parameters=parameter_list)
return optimizer
def get_optimizer(self):
bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR(
boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
return optimizer
def test_sgd(self):
......@@ -218,21 +237,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.5, gamma=0.9),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.5, gamma=0.9))
return optimizer
def test_sgd(self):
......@@ -241,21 +255,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ExponentialDecay(
learning_rate=0.5, gamma=0.9),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ExponentialDecay(
learning_rate=0.5, gamma=0.9))
return optimizer
def test_sgd(self):
......@@ -264,21 +273,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam(
learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
optimizer = paddle.optimizer.Adam(
learning_rate=paddle.optimizer.lr.InverseTimeDecay(
learning_rate=0.5, gamma=0.9),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
optimizer = paddle.optimizer.Adam(
learning_rate=paddle.optimizer.lr.InverseTimeDecay(
learning_rate=0.5, gamma=0.9))
return optimizer
def test_adam(self):
......@@ -287,15 +291,16 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PolynomialDecay(
learning_rate=0.5, decay_steps=5, cycle=self.cycle),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.PolynomialDecay(
learning_rate=0.5, decay_steps=5, cycle=self.cycle))
return optimizer
def test_sgd_cycle(self):
......@@ -307,17 +312,18 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
self._check_mlp()
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.5, T_max=5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.5, T_max=5))
return optimizer
def test_sgd(self):
......@@ -326,15 +332,110 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000),
parameter_list=parameter_list)
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LambdaDecay(
learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LambdaDecay(
learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.LinearWarmup(
learning_rate=0.5,
warmup_steps=20,
start_lr=0,
end_lr=0.5,
verbose=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.StepDecay(
learning_rate=0.5, step_size=5, gamma=0.8),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.StepDecay(
learning_rate=0.5, step_size=5, gamma=0.8))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=0.5),
parameters=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000))
optimizer = paddle.optimizer.SGD(
learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=0.5))
return optimizer
def test_sgd(self):
......@@ -381,7 +482,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
scheduler = paddle.optimizer.PiecewiseLR(bd, value)
scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
adam = paddle.optimizer.Adam(
scheduler, parameters=linear.parameters())
......@@ -396,7 +497,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
scheduler.step()
def test_lr_decay_natural_exp(self):
def test_lr_scheduler_natural_exp(self):
with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
......@@ -407,8 +508,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
loss = fluid.layers.reduce_mean(b)
base_lr = 1.0
scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5)
print("scheduler.last_lr", scheduler.last_lr)
scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5)
adam = paddle.optimizer.Adam(
scheduler, parameters=linear.parameters())
......@@ -453,7 +553,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
with self.assertRaises(RuntimeError):
adam = paddle.optimizer.Adam(
paddle.optimizer.NaturalExpLR(
paddle.optimizer.lr.NaturalExpDecay(
learning_rate=0.1, gamma=0.5),
parameters=linear.parameters())
adam.set_lr(0.01)
......@@ -695,10 +795,10 @@ class TestImperativeOptimizerList(unittest.TestCase):
linear_1 = Linear(10, 10)
linear_2 = Linear(10, 10)
sgd = SGDOptimizer(
1.0,
parameter_list=itertools.chain(linear_1.parameters(),
linear_2.parameters()))
sgd = paddle.optimizer.SGD(1.0,
parameters=itertools.chain(
linear_1.parameters(),
linear_2.parameters()))
in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
in_data = fluid.dygraph.to_variable(in_np)
......
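The per-iteration branch added to _check_mlp above is the general usage pattern: ReduceOnPlateau is stepped with the metric it monitors, every other LRScheduler with a bare step(). A condensed sketch outside the test harness (model, data, and patience value are placeholders):

import paddle

linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5, patience=2)
sgd = paddle.optimizer.SGD(learning_rate=scheduler,
                           parameters=linear.parameters())

for epoch in range(5):
    x = paddle.uniform(shape=[4, 10], min=-0.1, max=0.1)
    loss = paddle.mean(linear(x))
    loss.backward()
    sgd.step()
    sgd.clear_grad()
    if isinstance(scheduler, paddle.optimizer.lr.ReduceOnPlateau):
        scheduler.step(loss)   # plateau scheduler consumes the monitored metric
    else:
        scheduler.step()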
......@@ -239,7 +239,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -328,7 +328,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -436,7 +436,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -544,7 +544,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler, parameters=ptb_model.parameters())
......@@ -829,7 +829,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
scheduler = paddle.optimizer.PiecewiseLR(
scheduler = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr_arr)
adam = Adam(
learning_rate=scheduler,
......
......@@ -56,22 +56,22 @@ def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
return var_list[1]
class TestReduceLROnPlateauDecay(object):
class TestReduceOnPlateauDecay(object):
def test_ReduceLR(self):
# the decay rate must be less than 1.0
with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0)
paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=2.0)
# the mode must be "min" or "max"
with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test")
paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test")
# the threshold_mode must be "rel" or "abs"
with self.assertRaises(ValueError):
paddle.optimizer.ReduceLROnPlateau(
paddle.optimizer.lr.ReduceOnPlateau(
learning_rate=1.0, threshold_mode="test")
with self.assertRaises(TypeError):
paddle.optimizer.ReduceLROnPlateau(learning_rate="test")
paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test")
with self.assertRaises(TypeError):
paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test")
paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5).step("test")
places = [paddle.CPUPlace()]
if core.is_compiled_with_cuda():
......@@ -114,7 +114,7 @@ class TestReduceLROnPlateauDecay(object):
[1], 1, 'float32', persistable=True)
paddle.increment(x)
loss = paddle.sin(x)
scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam = paddle.optimizer.Adam(learning_rate=scheduler)
adam.minimize(loss)
lr_var = adam._global_learning_rate()
......@@ -158,7 +158,7 @@ class TestReduceLROnPlateauDecay(object):
var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam = paddle.optimizer.Adam(
learning_rate=scheduler, parameters=linear.parameters())
......@@ -180,7 +180,7 @@ class TestReduceLROnPlateauDecay(object):
loss, var_list)
self.assertEqual(current_lr, expected_lr)
state_dict = adam.state_dict()
scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs)
scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
adam1 = paddle.optimizer.Adam(
learning_rate=scheduler1, parameters=linear.parameters())
adam1.set_state_dict(state_dict)
......@@ -420,7 +420,7 @@ class TestLRScheduler(unittest.TestCase):
adam.clear_grad()
current_lr = adam.get_lr()
expected_lr = python_func(epoch, **kwarg)
if paddle_api.__name__ != "CosineAnnealingLR":
if paddle_api.__name__ != "CosineAnnealingDecay":
self.assertEqual(current_lr, expected_lr)
scheduler.step()
else:
......@@ -429,74 +429,75 @@ class TestLRScheduler(unittest.TestCase):
def test_scheduler(self):
with self.assertRaises(NotImplementedError):
paddle.optimizer.lr_scheduler._LRScheduler().step()
paddle.optimizer.lr.LRScheduler().step()
with self.assertRaises(TypeError):
paddle.optimizer.MultiStepLR(
paddle.optimizer.lr.MultiStepDecay(
learning_rate="test", milestones=[1, 2, 3])
with self.assertRaises(TypeError):
paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test')
paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones='test')
with self.assertRaises(ValueError):
paddle.optimizer.MultiStepLR(
paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[3, 2, 1])
with self.assertRaises(ValueError):
paddle.optimizer.MultiStepLR(
paddle.optimizer.lr.MultiStepDecay(
learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, {
func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, {
"d_model": 0.01,
"warmup_steps": 100,
"verbose": False
}), (piecewise_lr, paddle.optimizer.PiecewiseLR, {
}), (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, {
"boundaries": [3, 6, 9, 15, 20],
"values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
"verbose": False
}), (natural_exp_lr, paddle.optimizer.NaturalExpLR, {
}), (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, {
"learning_rate": 0.5,
"gamma": 0.1,
"verbose": True
}), (inverse_time_lr, paddle.optimizer.InverseTimeLR, {
}), (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, {
"learning_rate": 0.5,
"gamma": 0.1,
"verbose": False
}), (polynomial_lr, paddle.optimizer.PolynomialLR, {
}), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
"learning_rate": 0.5,
"decay_steps": 20,
"end_lr": 0,
"power": 1.0,
"cycle": False,
"verbose": True
}), (polynomial_lr, paddle.optimizer.PolynomialLR, {
}), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
"learning_rate": 0.5,
"decay_steps": 20,
"end_lr": 0,
"power": 1.0,
"cycle": True,
"verbose": False
}), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, {
}), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, {
'learning_rate': 0.5,
'warmup_steps': 20,
'start_lr': 0,
'end_lr': 0.5,
"verbose": True
}), (exponential_lr, paddle.optimizer.ExponentialLR, {
}), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, {
"learning_rate": 0.5,
"gamma": 0.9,
"verbose": False
}), (multi_step_lr, paddle.optimizer.MultiStepLR, {
}), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, {
"learning_rate": 0.5,
"milestones": [3, 6, 9, 15, 20],
"gamma": 0.8,
"verbose": True
}), (step_lr, paddle.optimizer.StepLR, {
}), (step_lr, paddle.optimizer.lr.StepDecay, {
"learning_rate": 0.5,
"step_size": 2,
"gamma": 0.8,
"verbose": False
}), (lambda_lr, paddle.optimizer.LambdaLR, {
}), (lambda_lr, paddle.optimizer.lr.LambdaDecay, {
"learning_rate": 0.5,
"lr_lambda": lambda x: 0.95**x,
"verbose": True
}), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, {
}), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, {
"learning_rate": 0.5,
"T_max": 10,
"verbose": False
......
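test_scheduler above asserts that the bare base class raises NotImplementedError; a hedged sketch of a custom schedule, assuming a subclass only needs to provide get_lr() on top of the base class's base_lr/last_epoch bookkeeping:

import paddle

class HalveEveryEpoch(paddle.optimizer.lr.LRScheduler):
    def get_lr(self):
        # the base class maintains base_lr and last_epoch; we only define the curve
        return self.base_lr * (0.5 ** self.last_epoch)

sched = HalveEveryEpoch(learning_rate=0.2)
print(sched())   # current learning rate
sched.step()
print(sched())   # expected to halve once last_epoch advances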
......@@ -24,11 +24,6 @@ __all__ += [
'DataParallel'
]
__all__ += [
'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
]
from . import random
from .random import manual_seed
from .framework import get_default_dtype
......@@ -51,11 +46,3 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS
from .io import save
from .io import load
from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS
......@@ -228,7 +228,7 @@ def save(obj, path):
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......@@ -320,7 +320,7 @@ def load(path, **configs):
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr_scheduler.NoamLR(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......
......@@ -121,13 +121,6 @@ from .layer.conv import ConvTranspose3d #DEFINE_ALIAS
# from .layer.conv import TreeConv #DEFINE_ALIAS
# from .layer.conv import Conv1D #DEFINE_ALIAS
from .layer.extension import RowConv #DEFINE_ALIAS
# from .layer.learning_rate import CosineDecay #DEFINE_ALIAS
# from .layer.learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .layer.learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .layer.learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .layer.learning_rate import NoamDecay #DEFINE_ALIAS
# from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS
from .layer.common import Linear
# from .layer.loss import NCELoss #DEFINE_ALIAS
from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS
......
......@@ -95,14 +95,6 @@ from .extension import target_assign #DEFINE_ALIAS
from .extension import temporal_shift #DEFINE_ALIAS
from .extension import warpctc #DEFINE_ALIAS
from .extension import diag_embed #DEFINE_ALIAS
from .learning_rate import cosine_decay #DEFINE_ALIAS
from .learning_rate import exponential_decay #DEFINE_ALIAS
from .learning_rate import inverse_time_decay #DEFINE_ALIAS
from .learning_rate import natural_exp_decay #DEFINE_ALIAS
from .learning_rate import noam_decay #DEFINE_ALIAS
from .learning_rate import piecewise_decay #DEFINE_ALIAS
from .learning_rate import polynomial_decay #DEFINE_ALIAS
from .learning_rate import linear_lr_warmup #DEFINE_ALIAS
# from .lod import sequence_concat #DEFINE_ALIAS
# from .lod import sequence_conv #DEFINE_ALIAS
# from .lod import sequence_enumerate #DEFINE_ALIAS
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
from ...fluid.layers import cosine_decay #DEFINE_ALIAS
from ...fluid.layers import exponential_decay #DEFINE_ALIAS
from ...fluid.layers import inverse_time_decay #DEFINE_ALIAS
from ...fluid.layers import natural_exp_decay #DEFINE_ALIAS
from ...fluid.layers import noam_decay #DEFINE_ALIAS
from ...fluid.layers import piecewise_decay #DEFINE_ALIAS
from ...fluid.layers import polynomial_decay #DEFINE_ALIAS
from ...fluid.layers import linear_lr_warmup #DEFINE_ALIAS
__all__ = [
'cosine_decay', 'exponential_decay', 'inverse_time_decay',
'natural_exp_decay', 'noam_decay', 'piecewise_decay', 'polynomial_decay',
'linear_lr_warmup'
]
......@@ -86,13 +86,6 @@ from .conv import ConvTranspose3d #DEFINE_ALIAS
# from .conv import TreeConv #DEFINE_ALIAS
# from .conv import Conv1D #DEFINE_ALIAS
from .extension import RowConv #DEFINE_ALIAS
# from .learning_rate import CosineDecay #DEFINE_ALIAS
# from .learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .learning_rate import NoamDecay #DEFINE_ALIAS
# from .learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .learning_rate import PolynomialDecay #DEFINE_ALIAS
# from .loss import NCELoss #DEFINE_ALIAS
from .loss import BCEWithLogitsLoss #DEFINE_ALIAS
from .loss import CrossEntropyLoss #DEFINE_ALIAS
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
__all__ = [
# 'CosineDecay',
# 'ExponentialDecay',
# 'InverseTimeDecay',
# 'NaturalExpDecay',
# 'NoamDecay',
# 'PiecewiseDecay',
# 'PolynomialDecay'
]
......@@ -16,10 +16,7 @@ __all__ = [
'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd',
'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer',
'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR',
'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
'ReduceLROnPlateau', 'CosineAnnealingLR'
'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer'
]
......@@ -36,6 +33,4 @@ from .adadelta import Adadelta
from .sgd import SGD
from .momentum import Momentum
from . import lr_scheduler
from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR
from . import lr
......@@ -48,8 +48,8 @@ class Adam(Optimizer):
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9.
......
......@@ -47,8 +47,8 @@ class Adamax(Optimizer):
it is added here for numerical stability to prevent the division by 0 error.
Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
......
......@@ -42,8 +42,8 @@ class AdamW(Adam):
Args:
learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler. The default value is 0.001.
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
......
......@@ -41,7 +41,7 @@ from paddle.fluid.layers import tensor
from functools import reduce
from ..fluid.wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
from .lr_scheduler import _LRScheduler
from .lr import LRScheduler
__all__ = ['Optimizer']
......@@ -54,8 +54,8 @@ class Optimizer(object):
but need to use one of it's implementation.
Args:
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``_LRScheduler`` .
learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``LRScheduler`` .
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
......@@ -82,12 +82,8 @@ class Optimizer(object):
#Take the subclass adam as an example
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear(inp)
loss = paddle.mean(out)
adam = paddle.optimizer.Adam(learning_rate=0.1,
......@@ -121,9 +117,9 @@ class Optimizer(object):
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__())
break
if not isinstance(learning_rate, (float, _LRScheduler)):
if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError(
"learning rate should be float or _LRScheduler, got %s here" %
"learning rate should be float or LRScheduler, got %s here" %
type(learning_rate))
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
......@@ -156,7 +152,7 @@ class Optimizer(object):
@framework.dygraph_only
def state_dict(self):
'''
Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be include in state dict.
Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict.
If the optimizer never be called(minimize function), the state_dict is empty.
Args:
......@@ -169,7 +165,6 @@ class Optimizer(object):
.. code-block:: python
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
......@@ -181,14 +176,14 @@ class Optimizer(object):
for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp
# global step if use lr decay
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
return state_dict
@framework.dygraph_only
def set_state_dict(self, state_dict):
'''
Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be changed.
Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed.
Args:
state_dict(dict) : Dict contains all the Tensor needed by optimizer
......@@ -199,26 +194,28 @@ class Optimizer(object):
.. code-block:: python
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000),
parameters=emb.parameters())
state_dict = adam.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
parameters=emb.parameters())
opt_state_dict = adam.state_dict()
paddle.save(opt_state_dict, "adam.pdopt")
opti_state_dict = paddle.load("adam.pdopt")
adam.set_state_dict(opti_state_dict)
'''
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
self._accumulators_holder = state_dict
......@@ -256,7 +253,7 @@ class Optimizer(object):
return self._opti_name_list
def _create_global_learning_rate(self):
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate()
# only create global lr_var once
if not isinstance(lr_var, framework.Variable):
......@@ -299,7 +296,7 @@ class Optimizer(object):
"""
:api_attr: imperative
Set the value of the learning rate manually in the optimizer. If the optimizer use _LRScheduler,
Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler,
this API cannot be invoked, because it will lead to conflict.
Args:
......@@ -312,7 +309,6 @@ class Optimizer(object):
.. code-block:: python
import paddle
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
......@@ -335,9 +331,9 @@ class Optimizer(object):
raise TypeError(
"The type of 'value' in optimizer.set_lr must be float, but received %s."
% (type(value)))
if isinstance(self._learning_rate, _LRScheduler):
if isinstance(self._learning_rate, LRScheduler):
raise RuntimeError(
"optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict."
"optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict."
)
self._learning_rate = float(value)
current_lr = self._global_learning_rate()
......@@ -358,7 +354,7 @@ class Optimizer(object):
"""
:api_attr: imperative
Get current step learning rate. The return value is all the same When _LRScheduler is not used,
Get current step learning rate. The return value is all the same When LRScheduler is not used,
otherwise return the current step learning rate.
......@@ -370,15 +366,13 @@ class Optimizer(object):
import numpy as np
import paddle
# example1: _LRScheduler is not used, return value is all the same
paddle.disable_static()
# example1: LRScheduler is not used, return value is all the same
emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
lr = adam.get_lr()
print(lr) # 0.001
# example2: PiecewiseLR is used, return the step learning rate
paddle.disable_static()
# example2: PiecewiseDecay is used, return the scheduled learning rate
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
......@@ -387,7 +381,7 @@ class Optimizer(object):
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0)
scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value, 0)
adam = paddle.optimizer.Adam(scheduler,
parameters=linear.parameters())
......@@ -656,7 +650,6 @@ class Optimizer(object):
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
......@@ -727,7 +720,6 @@ class Optimizer(object):
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
......@@ -805,7 +797,7 @@ class Optimizer(object):
import numpy as np
import paddle
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
......@@ -854,13 +846,9 @@ class Optimizer(object):
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear(input)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
......@@ -903,7 +891,7 @@ class Optimizer(object):
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
......
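set_lr's docstring above now warns that it conflicts with a scheduler-driven learning rate, and the unit tests earlier in this diff assert the RuntimeError; a compact sketch of that restriction:

import paddle

linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(
    paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.1, gamma=0.5),
    parameters=linear.parameters())

try:
    adam.set_lr(0.01)
except RuntimeError as err:
    # manual set_lr is rejected while an LRScheduler owns the learning rate
    print(err)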
......@@ -69,8 +69,8 @@ class RMSProp(Optimizer):
Parameters:
learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or a _LRScheduler.
learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler.
rho(float): rho is :math: `\\rho` in equation, default is 0.95.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to
avoid division by zero, default is 1e-6.
......