未验证 提交 914ff10a 编写于 作者: Z Zhou Wei 提交者: GitHub

fix state dict to save/load learning rate scheduler (#25403)

* fix optimizer.state_dict and LRScheduler.state_dict to save/load dygraph,test=develop

* fix optimizer.state_dict and LRScheduler.state_dict to save/load dygraph,test=develop

* Add a check that reports incorrect use of state_dict/set_dict,test=develop

* fix some doc errors,test=develop

* fix current_step_lr for _LearningRateEpochDecay,test=develop

* remove some unused code to improve coverage,test=develop

* remove some unused code to improve coverage,test=develop
上级 fc93266b
......@@ -80,9 +80,9 @@ def save_dygraph(state_dict, model_path):
for k, v in state_dict.items():
if isinstance(v, (Variable, core.VarBase)):
model_dict[k] = v.numpy()
name_table[k] = v.name
else:
model_dict[k] = v
name_table[k] = v.name
model_dict["StructuredToParameterName@@"] = name_table
file_name = model_path + suffix
......
......@@ -15,6 +15,7 @@
from __future__ import print_function
import math
import warnings
from .. import unique_name
from ..framework import Variable
......@@ -66,6 +67,51 @@ class LearningRateDecay(object):
persistable=False)
return lr
def state_dict(self):
    """
    Returns the state of the scheduler as a :class:`dict`.

    It is a subset of self.__dict__ : only the keys selected by
    ``_state_keys()`` that are currently present on the instance are
    included. A value that is a Variable is stored as its single element
    (``value.numpy()[0]``) rather than as the Variable itself.

    Returns:
        dict: mapping from attribute name to its saved value.

    Raises:
        AssertionError: if a Variable value does not have shape [1].
    """
    self._state_keys()
    state_dict = {}
    for key in self.keys:
        # Keys that were never set on this instance are silently skipped.
        if key not in self.__dict__:
            continue
        value = self.__dict__[key]
        if isinstance(value, Variable):
            # Raised explicitly (instead of using `assert`) so the shape
            # check is still enforced when Python runs with -O.
            if not (value.shape == [1]):
                raise AssertionError(
                    "shape of Variable in state_dict must be [1] {}".format(
                        value.shape))
            value = value.numpy()[0]
        state_dict[key] = value
    return state_dict
def _state_keys(self):
"""
set the keys in self.__dict__ that are needed to be saved.
"""
self.keys = ['step_num']
def set_dict(self, state_dict):
    """
    Loads the scheduler's state.

    Args:
        state_dict(dict): mapping that must contain every key selected by
            ``_state_keys()``; each such entry is copied into
            self.__dict__.

    Raises:
        RuntimeError: if a required key is missing from ``state_dict``.
            No state value is written to the instance in that case.
    """
    self._state_keys()
    # Validate every required key before writing anything, so a missing
    # key cannot leave the scheduler with a partially loaded state.
    for key in self.keys:
        if key not in state_dict:
            raise RuntimeError(
                "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
                format(key))
    for key in self.keys:
        self.__dict__[key] = state_dict[key]
    # Extra entries are tolerated but reported.
    if len(state_dict) > len(self.keys):
        warnings.warn(
            "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
        )
def step(self):
raise NotImplementedError()
......@@ -402,7 +448,7 @@ class PolynomialDecay(LearningRateDecay):
learning_rate(Variable|float): The initial learning rate. If the type
is Variable, it's a tensor with shape [1], the data type can be
float32 or float64. It also can be set to python int number.
decay_steps(int32): The decay step size. It determines the decay cycle.
decay_steps(int): The decay step size. It determines the decay cycle.
end_learning_rate(float, optional): The minimum final learning rate. The default value is 0.0001.
power(float, optional): Power of polynomial. The default value is 1.0.
cycle(bool, optional): If set true, decay the learning rate every decay_steps. The default value is False.
......@@ -785,7 +831,7 @@ class ReduceLROnPlateau(LearningRateDecay):
raise ValueError(
'new_lr = origin_lr * decay_rate and decay_rate should be < 1.0.'
)
self.decay_rate = decay_rate
self.decay_rate = self.create_lr_var(decay_rate)
threshold_mode = threshold_mode.lower()
if threshold_mode not in ['rel', 'abs']:
......@@ -794,8 +840,10 @@ class ReduceLROnPlateau(LearningRateDecay):
self.threshold_mode = threshold_mode
check_type(learning_rate, 'learning_rate', (float, int, Variable),
'ReduceLROnPlateau')
if isinstance(learning_rate, (float, int)):
learning_rate = self.create_lr_var(learning_rate)
if not isinstance(learning_rate, (float, int, Variable)):
raise TypeError(
"The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float, int, Variable', but received %s."
% type(learning_rate))
self.learning_rate = learning_rate
self.verbose = verbose
......@@ -809,9 +857,17 @@ class ReduceLROnPlateau(LearningRateDecay):
self.cooldown_counter = 0
self.best_loss = None
self.num_bad_epochs = 0
self.epoch = 0
self.epoch_num = 0
def _state_keys(self):
self.keys = [
'cooldown_counter', 'best_loss', 'num_bad_epochs', 'epoch_num',
'learning_rate'
]
def __call__(self):
    """
    Return the current learning rate as a Variable.

    If ``self.learning_rate`` is not a Variable yet, it is first replaced
    by the result of ``self.create_lr_var`` applied to it.
    """
    current = self.learning_rate
    if isinstance(current, Variable):
        return current
    self.learning_rate = self.create_lr_var(current)
    return self.learning_rate
def step(self, loss):
......@@ -837,7 +893,7 @@ class ReduceLROnPlateau(LearningRateDecay):
"should be (1L,), but the current loss.shape is {}. Maybe that " \
"you should call fluid.layers.mean to process it first.".format(loss.shape)
self.epoch += 1
self.epoch_num += 1
if self.cooldown_counter > 0:
self.cooldown_counter -= 1
else:
......@@ -855,10 +911,11 @@ class ReduceLROnPlateau(LearningRateDecay):
self.decay_rate, self.min_lr)
if self.learning_rate - new_lr > self.eps:
if self.verbose:
old_lr = self.learning_rate.numpy()[0] if isinstance(
self.learning_rate,
Variable) else self.learning_rate
print('Epoch {}: reducing learning rate from {} to {}.'.
format(self.epoch,
self.learning_rate.numpy()[0],
new_lr.numpy()[0]))
format(self.epoch_num, old_lr, new_lr.numpy()[0]))
self.learning_rate = new_lr
def _is_better(self, current, best):
......@@ -891,22 +948,28 @@ class _LearningRateEpochDecay(LearningRateDecay):
raise TypeError(
"The type of 'learning_rate' must be 'float, int', but received %s."
% type(learning_rate))
if learning_rate >= 1.0:
raise ValueError("The initial learning rate")
if learning_rate < 0:
raise ValueError("Invalid learning rate: {}".format(learning_rate))
self.base_lr = float(learning_rate)
self.epoch_num = -1
self.dtype = dtype
if dtype is None:
self.dtype = "float32"
self.learning_rate = self.create_lr_var(self.base_lr)
self.epoch()
def _state_keys(self):
self.keys = ['epoch_num', 'learning_rate']
def __call__(self):
    """
    Return last computed learning rate on current epoch.

    If ``self.learning_rate`` is not a Variable yet, it is first replaced
    by the result of ``self.create_lr_var`` applied to it.
    """
    current = self.learning_rate
    if isinstance(current, Variable):
        return current
    self.learning_rate = self.create_lr_var(current)
    return self.learning_rate
def epoch(self, epoch=None):
......@@ -919,8 +982,6 @@ class _LearningRateEpochDecay(LearningRateDecay):
self.epoch_num = epoch
self.learning_rate = self.get_lr()
if isinstance(self.learning_rate, float):
self.learning_rate = self.create_lr_var(self.learning_rate)
def get_lr(self):
raise NotImplementedError
......@@ -947,7 +1008,7 @@ class StepDecay(_LearningRateEpochDecay):
Parameters:
learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
step_size (int): Period of learning rate decay..
step_size (int): Period of learning rate decay.
decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` .
It should be less than 1.0. Default: 0.1.
......@@ -1025,7 +1086,7 @@ class MultiStepDecay(_LearningRateEpochDecay):
learning_rate = 0.005
Parameters:
learning_rate (float|int): The initial learning rate. It can be set to python float or int number. If it
learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` .
It should be less than 1.0. Default: 0.1.
......
......@@ -33,7 +33,7 @@ from .layers import ops
from .regularizer import append_regularization_ops
from .dygraph import base as imperative_base
from .dygraph import no_grad
from .dygraph.learning_rate_scheduler import LearningRateDecay
from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
from paddle.fluid import core
from paddle.fluid.layers import tensor
from functools import reduce
......@@ -149,12 +149,12 @@ class Optimizer(object):
state_dict[var_tmp.name] = var_tmp
# global step if use lr decay
if isinstance(self._learning_rate, LearningRateDecay):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
if not isinstance(self._learning_rate, _LearningRateEpochDecay):
var_tmp = None
if framework.in_dygraph_mode():
var_temp = framework._varbase_creator(
None, name='global_step', dtype='int32')
else:
var_temp = Variable(None, name='global_step', dtype='int32')
tensor.fill_constant(
[1], "int32", self._learning_rate.step_num, out=var_temp)
......@@ -193,22 +193,20 @@ class Optimizer(object):
'''
if isinstance(self._learning_rate, LearningRateDecay):
self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if not isinstance(self._learning_rate, _LearningRateEpochDecay):
assert 'global_step' in state_dict, \
'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict'
global_step = state_dict['global_step']
if isinstance(global_step, core.VarBase):
if isinstance(global_step, Variable):
step_np = global_step
step_np = np.array(step_np.value().get_tensor())
assert step_np.shape == (1,), \
"global step shape is (1,), the shape is {}".format( step_np.shape )
self._learning_rate.step_num = int(step_np[0])
elif isinstance(global_step, Variable):
step_np = global_step.numpy()
assert step_np.shape == (1,), \
"global step shape is (1,), the shape is {}".format( step_np.shape )
self._learning_rate.step_num = step_np[0]
elif isinstance(global_step, np.ndarray):
assert global_step.shape == (1,), \
"global step shape is (1,), the shape is {}".format( global_step.shape )
......@@ -423,11 +421,14 @@ class Optimizer(object):
"""
current_lr = self._global_learning_rate()
if current_lr:
if isinstance(current_lr, framework.Variable):
return self._global_learning_rate().numpy()[0]
if isinstance(self._learning_rate, float):
return self._learning_rate
elif isinstance(self._learning_rate, _LearningRateEpochDecay):
step_lr = self._learning_rate()
return step_lr.numpy()[0]
else:
step_lr = self._learning_rate.step()
if isinstance(step_lr, (float, int)):
......
......@@ -277,8 +277,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
self.opti_dict = adam.state_dict()
self.base_opti = {}
for k, v in self.opti_dict.items():
if isinstance(v, core.VarBase):
self.base_opti[v.name] = v.numpy()
self.assertTrue(np.sum(np.abs(v.numpy())) != 0)
else:
self.base_opti[k] = v
fluid.save_dygraph(self.opti_dict, "./test_dy")
......@@ -360,6 +363,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
opti_dict = adam.state_dict()
# set to zero
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
np_t = v.numpy()
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
......@@ -375,8 +379,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name]))
else:
self.assertEqual(v, self.base_opti[k])
# check parameter
state_dict = ptb_model.state_dict()
......@@ -466,6 +473,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
opti_dict = adam.state_dict()
# set to zero
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
np_t = v.numpy()
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
......@@ -476,11 +484,13 @@ class TestDygraphPtbRnn(unittest.TestCase):
adam._learning_rate.step_num = 0
adam.set_dict(self.opti_dict)
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name]))
else:
self.assertEqual(v, self.base_opti[k])
# check parameter
state_dict = ptb_model.state_dict()
......@@ -571,12 +581,14 @@ class TestDygraphPtbRnn(unittest.TestCase):
np_opti_dict = {}
# set to zero
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
np_t = v.numpy()
np_opti_dict[v.name] = np_t
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
else:
np_opti_dict[k] = v
if isinstance(adam._learning_rate, LearningRateDecay):
adam._learning_rate.step_num = 0
......@@ -585,8 +597,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name]))
else:
self.assertEqual(v, self.base_opti[k])
# check parameter
state_dict = ptb_model.state_dict()
......@@ -827,7 +842,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
np_state_dict = {}
for k, v in self.opti_dict.items():
if isinstance(v, core.VarBase):
np_opti_dict[v.name] = v.numpy()
else:
np_opti_dict[k] = v
for k, v in self.state_dict.items():
np_state_dict[k] = v.numpy()
......
......@@ -121,6 +121,104 @@ def lambda_decay(global_step, learning_rate, lr_lambda):
class TestLearningRateDecayDygraph(unittest.TestCase):
def test_LR_state_dict(self):
with fluid.dygraph.guard():
x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
linear = fluid.dygraph.Linear(10, 10)
input = fluid.dygraph.to_variable(x)
Exponential_scheduler = fluid.dygraph.ExponentialDecay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True)
Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)
adam1 = fluid.optimizer.Adam(
learning_rate=Exponential_scheduler,
parameter_list=linear.parameters())
adam2 = fluid.optimizer.Adam(
learning_rate=Step_scheduler,
parameter_list=linear.parameters())
adam3 = fluid.optimizer.Adam(
learning_rate=Reducelr_scheduler,
parameter_list=linear.parameters())
print(adam3.state_dict())
for epoch in range(10):
out = linear(input)
loss = fluid.layers.reduce_mean(out)
loss.backward()
adam1.minimize(loss)
adam2.minimize(loss)
adam3.minimize(loss)
linear.clear_gradients()
Step_scheduler.epoch()
Reducelr_scheduler.step(loss)
fluid.dygraph.save_dygraph(linear.state_dict(), "save_path")
Exponential_scheduler_test = fluid.dygraph.ExponentialDecay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True)
Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3)
Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau(
learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)
fluid.dygraph.save_dygraph(adam1.state_dict(), "save_path")
_, opt_state = fluid.dygraph.load_dygraph("save_path")
adam_test = fluid.optimizer.Adam(
learning_rate=Exponential_scheduler_test,
parameter_list=linear.parameters())
adam_test.set_dict(opt_state)
self.assertEqual(adam_test._learning_rate.step_num,
adam1._learning_rate.step_num,
"epoch_num is different before and after set_dict")
fluid.dygraph.save_dygraph(adam2.state_dict(), "save_path")
_, opt_state = fluid.dygraph.load_dygraph("save_path")
adam_test = fluid.optimizer.Adam(
learning_rate=Step_scheduler_test,
parameter_list=linear.parameters())
adam_test.set_dict(opt_state)
self.assertEqual(adam_test._learning_rate.epoch_num,
adam2._learning_rate.epoch_num,
"epoch_num is different before and after set_dict")
self.assertEqual(
adam_test._learning_rate(),
adam2._learning_rate(),
"current learning rate is different before and after set_dict")
fluid.dygraph.save_dygraph(adam3.state_dict(), "save_path")
_, opt_state = fluid.dygraph.load_dygraph("save_path")
adam_test = fluid.optimizer.Adam(
learning_rate=Reducelr_scheduler_test,
parameter_list=linear.parameters())
adam_test.set_dict(opt_state)
self.assertEqual(adam_test._learning_rate.best_loss,
adam3._learning_rate.best_loss.numpy()[0],
"best_loss is different before and after set_dict")
self.assertEqual(
adam_test._learning_rate.cooldown_counter,
adam3._learning_rate.cooldown_counter,
"cooldown_counter is different before and after set_dict")
self.assertEqual(
adam_test._learning_rate.num_bad_epochs,
adam3._learning_rate.num_bad_epochs,
"num_bad_epochs is different before and after set_dict")
self.assertEqual(adam_test._learning_rate.epoch_num,
adam3._learning_rate.epoch_num,
"epoch is different before and after set_dict")
self.assertEqual(
adam_test._learning_rate(),
adam3._learning_rate(),
"current learning rate is different before and after set_dict")
def test_NoamDecay(self):
with fluid.dygraph.guard():
d_model = 0.01
......@@ -169,17 +267,22 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
learning_rate = 0.5
milestones = [2, 4, 8]
decay_rate = 0.2
linear = fluid.dygraph.Linear(10, 10)
scheduler = fluid.dygraph.MultiStepDecay(learning_rate, milestones,
decay_rate)
adam = fluid.optimizer.AdamOptimizer(
learning_rate=scheduler, parameter_list=linear.parameters())
for epoch in range(10):
right_result = multi_step_decay(epoch, learning_rate,
milestones, decay_rate)
fluid_result = scheduler().numpy()[0]
fluid_result = adam.current_step_lr()
scheduler.epoch()
self.assertAlmostEqual(
right_result,
fluid_result,
msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
format(epoch, right_result, fluid_result))
with self.assertRaises(ValueError):
......@@ -194,7 +297,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50])
with self.assertRaises(ValueError):
lr = fluid.dygraph.MultiStepDecay(2.0, [20, 30, 50])
lr = fluid.dygraph.MultiStepDecay(-1, [20, 30, 50])
def test_StepDecay(self):
with fluid.dygraph.guard():
......@@ -211,15 +314,14 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
self.assertAlmostEqual(
right_result,
fluid_result,
msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
format(epoch, right_result, fluid_result))
with self.assertRaises(TypeError):
lr = fluid.dygraph.MultiStepDecay(learning_rate, "test", 0.1)
lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1)
with self.assertRaises(ValueError):
lr = fluid.dygraph.MultiStepDecay(learning_rate, [20, 30, 50],
1)
lr = fluid.dygraph.StepDecay(learning_rate, 20, 2)
def test_LambdaDecay(self):
with fluid.dygraph.guard():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
新手
引导
客服 返回
顶部