Unverified commit 022dfed4 · Authored by: Jiabin Yang · Committed by: GitHub

Add optimizer save and load (#16986)

* save optimizer related vars in dygraph

* test=develop, add optimizer save and load

* test=develop, add optimizer save and load

* test=develop, merge code and add multi-optimizer save and load

* test=develop, fix test_imperative_checkpoint

* test=develop, fix include error

* test=develop, fix include error

* test=develop, renew api spec

* test=develop, refine code

* test=develop, set default value for checkpoint

* test=develop, fix ci error

* test=develop, change API.spec and make api more readable

* test=develop, refine version and time stamp

* test=develop, add example code and refine code

* test=develop, refine doc

* test=develop, change version
Parent 453a49b1
This diff is collapsed.
@@ -92,3 +92,6 @@ def to_variable(value, block=None, name=None):
        return py_var
    elif isinstance(value, framework.Variable):
        return value
    else:
        raise TypeError(
            "to_variable only accepts 'ndarray' and 'Variable' as value's input")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,13 +16,18 @@ from __future__ import print_function
import os
import collections
from .. import core
from ..framework import Variable, default_main_program
import pickle
from . import learning_rate_scheduler
import warnings

__all__ = ['save_persistables', 'load_persistables']
def save_persistables(model_dict,
                      optimizer=None,
                      dirname='save_dir',
                      filename=None):
    """
    This function filters out all variables in layer.parameters from the
    given `layer` and then tries to save these variables to the folder
@@ -34,12 +39,12 @@ def save_persistables(vardict, dirname, filename=None):
    the file name.

    Args:
        model_dict(dict of Parameters): The parameters will
                                        be saved. If it is None, nothing
                                        will be done.
        dirname(str): The directory path.
        filename(str|None): The file which saved all variables. If variables were
                            saved in different files, set it to None.
                            Default: None

    Returns:
@@ -71,11 +76,11 @@ def save_persistables(vardict, dirname, filename=None):
            fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path,
                                            layer=ptb_model)
    """
    if isinstance(model_dict, collections.OrderedDict):
        _save_var_to_file(model_dict, optimizer, dirname, filename)
def load_persistables(dirname='save_dir'):
    """
    This function tries to load persistable variables from the folder
    `dirname` or the file `filename`.
@@ -86,7 +91,8 @@ def load_persistables(dirname):
    the file name.

    Args:
        dirname(str): The directory path. Default: 'save_dir'
        optimizer(Optimizer): Optimizer to be saved

    Returns:
        dict: The parameter-dict resumed from file
@@ -103,7 +109,7 @@ def load_persistables(dirname):
    return _load_var_from_file(dirname)
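# A minimal end-to-end sketch of the new save/load flow (illustrative only; FC stands in
# for a real model and 'save_dir' is just an example path):
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    fc = FC('fc', 10)
    sgd = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
        learning_rate=0.1, decay_steps=10000, decay_rate=0.5, staircase=True))

    loss = fluid.layers.reduce_mean(fc(fluid.dygraph.to_variable(
        np.ones([4, 8], dtype='float32'))))
    loss.backward()
    sgd.minimize(loss)

    # An optimizer (or a list of optimizers) can now be saved next to the parameters.
    fluid.dygraph.save_persistables(fc.state_dict(), optimizer=sgd, dirname='save_dir')

    # load_persistables now returns two dicts: parameters and pickled LearningRateDecay state.
    parameters, optimizers = fluid.dygraph.load_persistables('save_dir')
    fc.load_dict(parameters)
    sgd.load(optimizers)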
def _save_var_to_file(stat_dict, optimizers, file_dir, file_name):
    save_block = default_main_program().global_block()
    save_var_map = {}
    for var_key, each_var in stat_dict.items():
@@ -117,6 +123,32 @@ def _save_var_to_file(stat_dict, file_dir, file_name):
                'file_path': os.path.join(file_dir,
                                          os.path.normpath(each_var.name))
            })
    if not isinstance(optimizers, (list, tuple)):
        optimizers = [optimizers]

    if not os.path.exists(os.path.join(file_dir, os.path.normpath("optimizers"))):
        os.mkdir(os.path.join(file_dir, os.path.normpath("optimizers")))

    for optimizer in optimizers:
        if isinstance(optimizer._learning_rate,
                      learning_rate_scheduler.LearningRateDecay):
            try:
                f = open(
                    os.path.join(file_dir, "optimizers",
                                 os.path.normpath(str(optimizer._name))), "wb")
                # Pickle the LearningRateDecay object (protocol 2) so its state, e.g. the
                # current step count, survives a save/load round trip.
                pickle.dump(optimizer._learning_rate, f, 2)
                f.close()
            except IOError:
                raise IOError("Can't save %s" %
                              os.path.join(
                                  file_dir, "optimizers",
                                  os.path.normpath(str(optimizer._name))))
        else:
            warnings.warn(
                "Optimizer not saved: only optimizers with a 'LearningRateDecay' learning "
                "rate need to be saved under DyGraph mode")
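    # Resulting on-disk layout (sketch; actual variable and optimizer names depend on the model):
    #   <dirname>/<variable name>                -- one file per persistable variable
    #   <dirname>/optimizers/<optimizer._name>   -- one pickled LearningRateDecay per optimizer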
    if file_name is not None:
        save_var_list = []
@@ -138,6 +170,8 @@ def _load_var_from_file(file_dir):
        var_name_list = []
        if os.path.exists(base_path):
            for dirpath, dirnames, filenames in os.walk(base_path):
                # skip the pickled optimizer state; it is handled separately below
                if "optimizers" in dirpath:
                    continue
                pt = dirpath.replace(base_path, "", 1)
                if pt.startswith("/") or pt.startswith("\\"):
                    pt = pt[1:]
@@ -152,6 +186,7 @@ def _load_var_from_file(file_dir):
    load_block = default_main_program().global_block()
    load_var_map = {}
    load_optimizer_map = {}
    file_var_list = walk_filename(file_dir)
    for var_name in file_var_list:
        new_var = Variable(block=load_block, name=var_name)
@@ -165,8 +200,22 @@ def _load_var_from_file(file_dir):
            })
        load_var_map[new_var.name] = new_var

    opt_path = os.path.join(file_dir, "optimizers")
    for _, _, optimizers in os.walk(opt_path):
        for optimizer in optimizers:
            try:
                f = open(os.path.join(opt_path, optimizer), "rb")
                # each file under "optimizers" holds a pickled LearningRateDecay object,
                # keyed by the generated optimizer name it was saved under
                load_optimizer_map[optimizer] = pickle.load(f)
                f.close()
            except IOError:
                raise IOError("Can't load %s" % os.path.join(opt_path, optimizer))
    if len(load_optimizer_map) == 0:
        warnings.warn("No optimizer loaded")

    return load_var_map, load_optimizer_map
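    # Shape of the returned pair (sketch; 'SGDOptimizer_0' is only an example of a generated name):
    #   load_var_map        {variable_name: Variable, ...}        -> consumed by Layer.load_dict()
    #   load_optimizer_map  {'SGDOptimizer_0': LearningRateDecay} -> consumed by Optimizer.load(),
    #                                                                which looks up its own _name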
def _clone_var_in_block_(block, var):
...
@@ -63,13 +63,13 @@ class PiecewiseDecay(LearningRateDecay):
        self.vars = []
        for value in values:
            self.vars.append(value)

    def step(self):
        for i in range(len(self.boundaries)):
            if self.step_num < self.boundaries[i]:
                return self.vars[i]
        return self.create_lr_var(self.vars[len(self.values) - 1])
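        # Note: the decay values are now kept as plain Python numbers rather than eager
        # variables (presumably so that a pickled PiecewiseDecay, as written by
        # _save_var_to_file above, does not capture framework state); within step(),
        # create_lr_var is only applied for the final interval.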

class NaturalExpDecay(LearningRateDecay):
...
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,25 +16,25 @@ from __future__ import print_function
import numpy as np
from collections import defaultdict
from functools import reduce
from paddle.fluid import core
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program
from paddle.fluid.layers import tensor
from . import framework
from . import layers
from . import unique_name
from .backward import append_backward
from .clip import append_gradient_clip_ops, error_clip_callback
from .dygraph import base as imperative_base
from .dygraph.learning_rate_scheduler import LearningRateDecay
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
from .layers import ops
from .regularizer import append_regularization_ops
from .wrapped_decorator import signature_safe_contextmanager

__all__ = [
@@ -63,14 +63,18 @@ class Optimizer(object):
                raise TypeError(
                    "learning rate should be float or LearningRateDecay, got %s here"
                    % type(learning_rate))
            if name is not None:
                self._name = unique_name.generate(name)
            else:
                self._name = unique_name.generate(self.__class__.__name__)
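            # The generated name (e.g. 'SGDOptimizer_0') later doubles as the pickle file name
            # under <dirname>/optimizers/ in _save_var_to_file and as the lookup key used by
            # Optimizer.load().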
        else:
            if not isinstance(learning_rate, float) and \
                    not isinstance(learning_rate, framework.Variable):
                raise TypeError(
                    "learning rate should be float or Variable, got %s here" %
                    type(learning_rate))
            self._name = name

        self.regularization = regularization
        self._learning_rate = learning_rate
        # the learning rate type should be inferred from loss
@@ -89,6 +93,90 @@ class Optimizer(object):
        self.helper = None
        self._opti_name_list = []
    def load(self, stat_dict):
        """
        Load the learning rate decay state of an optimizer saved under dygraph mode.

        Args:
            stat_dict: the dict of optimizers returned by the load_persistables method

        Returns:
            None

        Examples:

         .. code-block:: python

            from __future__ import print_function
            import numpy as np
            import paddle
            import paddle.fluid as fluid
            from paddle.fluid.optimizer import SGDOptimizer
            from paddle.fluid.dygraph.nn import FC
            from paddle.fluid.dygraph.base import to_variable

            class MLP(fluid.Layer):
                def __init__(self, name_scope):
                    super(MLP, self).__init__(name_scope)
                    self._fc1 = FC(self.full_name(), 10)
                    self._fc2 = FC(self.full_name(), 10)

                def forward(self, inputs):
                    y = self._fc1(inputs)
                    y = self._fc2(y)
                    return y

            with fluid.dygraph.guard():
                mlp = MLP('mlp')
                optimizer = SGDOptimizer(
                    learning_rate=fluid.layers.natural_exp_decay(
                        learning_rate=0.1,
                        decay_steps=10000,
                        decay_rate=0.5,
                        staircase=True))
                train_reader = paddle.batch(
                    paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

                for batch_id, data in enumerate(train_reader()):
                    dy_x_data = np.array(
                        [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
                    y_data = np.array([x[1] for x in data]).astype('int64').reshape(
                        128, 1)
                    img = to_variable(dy_x_data)
                    label = to_variable(y_data)
                    label._stop_gradient = True
                    cost = mlp(img)
                    avg_loss = fluid.layers.reduce_mean(cost)
                    avg_loss.backward()
                    optimizer.minimize(avg_loss)
                    mlp.clear_gradients()
                    fluid.dygraph.save_persistables(
                        mlp.state_dict(), [optimizer], "save_dir_2")
                    if batch_id == 2:
                        break

            with fluid.dygraph.guard():
                mlp_load = MLP('mlp')
                optimizer_load = SGDOptimizer(
                    learning_rate=fluid.layers.natural_exp_decay(
                        learning_rate=0.1,
                        decay_steps=10000,
                        decay_rate=0.5,
                        staircase=True))
                parameters, optimizers = fluid.dygraph.load_persistables(
                    "save_dir_2")
                mlp_load.load_dict(parameters)
                optimizer_load.load(optimizers)
                # the restored decay state matches the one that was saved
                assert optimizer._learning_rate.__dict__ == optimizer_load._learning_rate.__dict__
        """
        if framework.in_dygraph_mode():
            self._learning_rate = stat_dict[self._name]
        else:
            raise TypeError("load can only be used under DyGraph mode")
    def get_opti_var_name_list(self):
        return self._opti_name_list
...
@@ -144,14 +144,14 @@ class TestDygraphCheckpoint(unittest.TestCase):
                avg_loss.backward()
                sgd.minimize(avg_loss)
                fluid.dygraph.save_persistables(mnist.state_dict(), [sgd],
                                                "save_dir")
                mnist.clear_gradients()

                for param in mnist.parameters():
                    dy_param_init_value[param.name] = param.numpy()

                restore, _ = fluid.dygraph.load_persistables("save_dir")
                mnist.load_dict(restore)

                self.assertEqual(len(dy_param_init_value), len(restore))
...
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph.base import to_variable
class MLP(fluid.Layer):
    def __init__(self, name_scope):
        super(MLP, self).__init__(name_scope)
        self._fc1 = FC(self.full_name(), 10)
        self._fc2 = FC(self.full_name(), 10)

    def forward(self, inputs):
        y = self._fc1(inputs)
        y = self._fc2(y)
        return y
class TestImperativeOptimizerBase(unittest.TestCase):
    def setUp(self):
        self.batch_num = 20

    def get_optimizer(self):
        raise NotImplementedError()

    def _check_mlp(self):
        seed = 90
        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mlp = MLP('mlp')
            optimizer = self.get_optimizer()
            optimizer2 = SGDOptimizer(
                learning_rate=fluid.layers.natural_exp_decay(
                    learning_rate=0.1,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))

            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array(
                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
                    128, 1)

                img = to_variable(dy_x_data)
                label = to_variable(y_data)
                label._stop_gradient = True

                cost = mlp(img)
                avg_loss = fluid.layers.reduce_mean(cost)
                avg_loss.backward()
                optimizer.minimize(avg_loss)
                optimizer2.minimize(avg_loss)
                mlp.clear_gradients()
                fluid.dygraph.save_persistables(
                    mlp.state_dict(), [optimizer, optimizer2], "save_dir_2")
                if batch_id == 2:
                    break

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mlp_load = MLP('mlp')
            optimizer_load1 = self.get_optimizer()
            optimizer_load2 = SGDOptimizer(
                learning_rate=fluid.layers.natural_exp_decay(
                    learning_rate=0.1,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
            parameters, optimizers = fluid.dygraph.load_persistables(
                "save_dir_2")
            mlp_load.load_dict(parameters)
            optimizer_load1.load(optimizers)
            optimizer_load2.load(optimizers)

            self.assertTrue(optimizer._learning_rate.__dict__ ==
                            optimizer_load1._learning_rate.__dict__)
            self.assertTrue(optimizer2._learning_rate.__dict__ ==
                            optimizer_load2._learning_rate.__dict__)
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        bd = [3, 6, 9]
        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True))
        return optimizer

    def test_adam(self):
        self._check_mlp()


class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
            learning_rate=0.1, decay_steps=5, cycle=self.cycle))
        return optimizer

    def test_sgd_cycle(self):
        self.cycle = True
        self._check_mlp()

    def test_sgd(self):
        self.cycle = False
        self._check_mlp()


class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
            learning_rate=0.1, step_each_epoch=10000, epochs=120))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
    def get_optimizer(self):
        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
            d_model=512, warmup_steps=8000))
        return optimizer

    def test_sgd(self):
        self._check_mlp()


if __name__ == '__main__':
    unittest.main()