From 022dfed4fc0abc57fcc60b4b8b506f4019ee4d40 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Thu, 6 Jun 2019 12:47:26 +0800 Subject: [PATCH] Add optimizer save and load (#16986) * save optimizer related vars in dygraph * test=develop, add optimizer save and load * test=develop, add optimizer save and load * test=develop, merge code and add multi-optimizer save and load * test=develop, fix test_imperative_checkpoint * test=develop, fix include error * test=develop, fix include error * test=develop, renew api spec * test=develop, refine code * test=develop, set default value for checkpoint * test=develop, fix ci error * test=develop, change API.spec and make api more readable * test=develop, refine version and time stamp * test=develop, add example code and refine code * test=develop, refine doc * test=develop, change version --- paddle/fluid/API.spec | 13 ++ python/paddle/fluid/dygraph/base.py | 3 + python/paddle/fluid/dygraph/checkpoint.py | 73 +++++-- .../fluid/dygraph/learning_rate_scheduler.py | 4 +- python/paddle/fluid/optimizer.py | 102 ++++++++- .../unittests/test_imperative_checkpoint.py | 4 +- .../test_imperative_save_load_optimizer.py | 196 ++++++++++++++++++ 7 files changed, 372 insertions(+), 23 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 0a1d2c48b..4ce265860 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -450,54 +450,63 @@ paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'para paddle.fluid.optimizer.SGDOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.SGDOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.MomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), 
('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.MomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) 
paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamaxOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], 
varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.FtrlOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.FtrlOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.RMSPropOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, 
keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdadeltaOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '648010d0ac1fa707dac0b89f74b0e35c')) @@ -505,6 +514,7 @@ paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'para paddle.fluid.optimizer.ModelAverage.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.ModelAverage.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '5f14ea4adda2791e1c3b37ff327f6a83')) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -512,18 +522,21 @@ paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['sel paddle.fluid.optimizer.LarsMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], 
varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LarsMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LambOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 
'6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LambOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a')) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c19a2f926..63f861e38 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -92,3 +92,6 @@ def to_variable(value, block=None, name=None): return py_var elif isinstance(value, framework.Variable): return value + else: + raise TypeError( + "to_variable only accepts 'ndarray' and 'Variable' as value's input") diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f96b53e8c..46cdada9e 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,13 +16,18 @@ from __future__ import print_function import os import collections -from .. import core from ..framework import Variable, default_main_program +import pickle +from . import learning_rate_scheduler +import warnings __all__ = ['save_persistables', 'load_persistables'] -def save_persistables(vardict, dirname, filename=None): +def save_persistables(model_dict, + optimizer=None, + dirname='save_dir', + filename=None): """ This function filters out all variables in layer.parameters from the give `layer` and then trys to load these variables from the folder @@ -34,12 +39,12 @@ def save_persistables(vardict, dirname, filename=None): the file name. Args: - vardict(dict of Parameters): The parameters will + model_dict(dict of Parameters): The parameters will be saved. If it is None, nothing will be deal. dirname(str): The directory path. filename(str|None): The file which saved all variables. If variables were - saved in differnet files, set it to None. + saved in different files, set it to None. Default: None Returns: @@ -71,11 +76,11 @@ def save_persistables(vardict, dirname, filename=None): fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, layer=ptb_model) """ - if isinstance(vardict, collections.OrderedDict): - _save_var_to_file(vardict, dirname, filename) + if isinstance(model_dict, collections.OrderedDict): + _save_var_to_file(model_dict, optimizer, dirname, filename) -def load_persistables(dirname): +def load_persistables(dirname='save_dir'): """ This function trys to load persistable variables from the folder `dirname` or the file `filename`. @@ -86,7 +91,8 @@ def load_persistables(dirname): the file name. 
Args: - dirname(str): The directory path. + dirname(str): The directory path. default is save_dir + optimizer(Optimizer): Optimizer to be saved Returns: dict: The parameter-dict resumed from file @@ -103,7 +109,7 @@ def load_persistables(dirname): return _load_var_from_file(dirname) -def _save_var_to_file(stat_dict, file_dir, file_name): +def _save_var_to_file(stat_dict, optimizers, file_dir, file_name): save_block = default_main_program().global_block() save_var_map = {} for var_key, each_var in stat_dict.items(): @@ -117,6 +123,32 @@ def _save_var_to_file(stat_dict, file_dir, file_name): 'file_path': os.path.join(file_dir, os.path.normpath(each_var.name)) }) + if isinstance(optimizers, (list, tuple)): + optimizers = optimizers + else: + optimizers = [optimizers] + if os.path.exists(os.path.join(file_dir, os.path.normpath("optimizers"))): + pass + else: + os.mkdir(os.path.join(file_dir, os.path.normpath("optimizers"))) + for optimizer in optimizers: + if isinstance(optimizer._learning_rate, + learning_rate_scheduler.LearningRateDecay): + try: + f = open( + os.path.join(file_dir, "optimizers", + os.path.normpath(str(optimizer._name))), "wb") + pickle.dump(optimizer._learning_rate, f, 2) + f.close() + except (): + raise IOError("Can't load %s", + os.path.join( + file_dir, "optimizers", + os.path.normpath(str(optimizer._name)))) + else: + warnings.warn( + "Optimizer not saved, Only optimizer with 'LearningRateDecay' under DyGraph mode need to be saved" + ) if file_name is not None: save_var_list = [] @@ -138,6 +170,8 @@ def _load_var_from_file(file_dir): var_name_list = [] if os.path.exists(base_path): for dirpath, dirnames, filenames in os.walk(base_path): + if "optimizers" in dirpath: + continue pt = dirpath.replace(base_path, "", 1) if pt.startswith("/") or pt.startswith("\\"): pt = pt[1:] @@ -152,6 +186,7 @@ def _load_var_from_file(file_dir): load_block = default_main_program().global_block() load_var_map = {} + load_optimizer_map = {} file_var_list = walk_filename(file_dir) for var_name in file_var_list: new_var = Variable(block=load_block, name=var_name) @@ -165,8 +200,22 @@ def _load_var_from_file(file_dir): }) load_var_map[new_var.name] = new_var - - return load_var_map + opt_path = os.path.join(file_dir, "optimizers") + for _, _, optimizers in os.walk(opt_path): + for optimizer in optimizers: + try: + f = open(os.path.join(opt_path, optimizer), "rb") + load_optimizer_map[optimizer] = pickle.load(f) + f.close() + except IOError: + raise IOError("Can't load %s", + os.path.join( + file_dir, "optimizers", + os.path.normpath(str(optimizer._name)))) + if len(load_optimizer_map) == 0: + warnings.warn("No optimizer loaded") + + return load_var_map, load_optimizer_map def _clone_var_in_block_(block, var): diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 3209fa76d..d425d1c25 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -63,13 +63,13 @@ class PiecewiseDecay(LearningRateDecay): self.vars = [] for value in values: - self.vars.append(self.create_lr_var(value)) + self.vars.append(value) def step(self): for i in range(len(self.boundaries)): if self.step_num < self.boundaries[i]: return self.vars[i] - return self.vars[len(self.values) - 1] + return self.create_lr_var(self.vars[len(self.values) - 1]) class NaturalExpDecay(LearningRateDecay): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 
a85344ecf..17ac54e90 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,25 +16,25 @@ from __future__ import print_function import numpy as np from collections import defaultdict -from functools import reduce -from paddle.fluid import core from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program -from paddle.fluid.layers import tensor from . import framework from . import layers from . import unique_name from .backward import append_backward from .clip import append_gradient_clip_ops, error_clip_callback -from .dygraph import base as imperative_base -from .dygraph.learning_rate_scheduler import LearningRateDecay from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops +from .dygraph import base as imperative_base +from .dygraph.learning_rate_scheduler import LearningRateDecay +from paddle.fluid import core +from paddle.fluid.layers import tensor +from functools import reduce from .wrapped_decorator import signature_safe_contextmanager __all__ = [ @@ -63,14 +63,18 @@ class Optimizer(object): raise TypeError( "learning rate should be float or LearningRateDecay, got %s here" % type(learning_rate)) + if name is not None: + self._name = unique_name.generate(name) + else: + self._name = unique_name.generate(self.__class__.__name__) else: if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError( "learning rate should be float or Variable, got %s here" % type(learning_rate)) + self._name = name - self._name = name self.regularization = regularization self._learning_rate = learning_rate # the learning rate type should be inferenced from loss @@ -89,6 +93,90 @@ class Optimizer(object): self.helper = None self._opti_name_list = [] + def load(self, stat_dict): + """ + load optimizer with learning rate decay in dygraph mode + :return: None + + Args: + stat_dict: the dict load by load_persistable method + + Examples: + + .. 
code-block:: python + + from __future__ import print_function + import numpy as np + import paddle + import paddle.fluid as fluid + from paddle.fluid.optimizer import SGDOptimizer + from paddle.fluid.dygraph.nn import FC + from paddle.fluid.dygraph.base import to_variable + + class MLP(fluid.Layer): + def __init__(self, name_scope): + super(MLP, self).__init__(name_scope) + + self._fc1 = FC(self.full_name(), 10) + self._fc2 = FC(self.full_name(), 10) + + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y + + with fluid.dygraph.guard(): + mlp = MLP('mlp') + optimizer2 = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + for batch_id, data in enumerate(train_reader()): + dy_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + avg_loss.backward() + optimizer.minimize(avg_loss) + mlp.clear_gradients() + fluid.dygraph.save_persistables( + mlp.state_dict(), [optimizer, optimizer2], "save_dir_2") + if batch_id == 2: + break + + with fluid.dygraph.guard(): + mlp_load = MLP('mlp') + optimizer_load2 = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + parameters, optimizers = fluid.dygraph.load_persistables( + "save_dir_2") + mlp_load.load_dict(parameters) + optimizer_load2.load(optimizers) + self.assertTrue(optimizer2._learning_rate.__dict__ == optimizer_load2._learning_rate.__dict__) + + """ + if framework.in_dygraph_mode(): + self._learning_rate = stat_dict[self._name] + else: + raise TypeError("load can only be used under DyGraph mode") + def get_opti_var_name_list(self): return self._opti_name_list diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py index e6ab3840a..6a4330df7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py @@ -144,14 +144,14 @@ class TestDygraphCheckpoint(unittest.TestCase): avg_loss.backward() sgd.minimize(avg_loss) - fluid.dygraph.save_persistables(mnist.state_dict(), + fluid.dygraph.save_persistables(mnist.state_dict(), [sgd], "save_dir") mnist.clear_gradients() for param in mnist.parameters(): dy_param_init_value[param.name] = param.numpy() - restore = fluid.dygraph.load_persistables("save_dir") + restore, _ = fluid.dygraph.load_persistables("save_dir") mnist.load_dict(restore) self.assertEqual(len(dy_param_init_value), len(restore)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py new file mode 100644 index 000000000..4aad828b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.optimizer import SGDOptimizer, Adam +from paddle.fluid.dygraph.nn import FC +from paddle.fluid.dygraph.base import to_variable + + +class MLP(fluid.Layer): + def __init__(self, name_scope): + super(MLP, self).__init__(name_scope) + + self._fc1 = FC(self.full_name(), 10) + self._fc2 = FC(self.full_name(), 10) + + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y + + +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 20 + + def get_optimizer(self): + raise NotImplementedError() + + def _check_mlp(self): + seed = 90 + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mlp = MLP('mlp') + optimizer = self.get_optimizer() + optimizer2 = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + for batch_id, data in enumerate(train_reader()): + dy_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + + avg_loss.backward() + optimizer.minimize(avg_loss) + optimizer2.minimize(avg_loss) + mlp.clear_gradients() + fluid.dygraph.save_persistables( + mlp.state_dict(), [optimizer, optimizer2], "save_dir_2") + if batch_id == 2: + break + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mlp_load = MLP('mlp') + optimizer_load1 = self.get_optimizer() + optimizer_load2 = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + parameters, optimizers = fluid.dygraph.load_persistables( + "save_dir_2") + mlp_load.load_dict(parameters) + optimizer_load1.load(optimizers) + optimizer_load2.load(optimizers) + + self.assertTrue(optimizer._learning_rate.__dict__ == + optimizer_load1._learning_rate.__dict__) + self.assertTrue(optimizer2._learning_rate.__dict__ == + optimizer_load2._learning_rate.__dict__) + + +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def 
test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + +class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +if __name__ == '__main__': + unittest.main() -- GitLab
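
A minimal usage sketch of the API this patch adds (not part of the diff): it assembles the save/restore flow from the new Optimizer.load docstring and test_imperative_save_load_optimizer.py above. It assumes the Paddle 1.x fluid dygraph API exactly as used in the patch (fluid.dygraph.guard, FC, save_persistables/load_persistables); the directory name "save_dir_sketch" and the random input are illustrative only.

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.optimizer import SGDOptimizer
    from paddle.fluid.dygraph.nn import FC
    from paddle.fluid.dygraph.base import to_variable


    class MLP(fluid.Layer):
        def __init__(self, name_scope):
            super(MLP, self).__init__(name_scope)
            self._fc = FC(self.full_name(), 10)

        def forward(self, inputs):
            return self._fc(inputs)


    with fluid.dygraph.guard():
        mlp = MLP('mlp')
        # Only optimizers whose learning rate is a dygraph LearningRateDecay
        # carry state worth serializing; a plain float LR is skipped with a warning.
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.natural_exp_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True))

        img = to_variable(np.random.random((4, 784)).astype('float32'))
        loss = fluid.layers.reduce_mean(mlp(img))
        loss.backward()
        optimizer.minimize(loss)
        mlp.clear_gradients()

        # Parameters and the optimizer's learning-rate scheduler (pickled under
        # save_dir_sketch/optimizers) are written together.
        fluid.dygraph.save_persistables(mlp.state_dict(), [optimizer],
                                        "save_dir_sketch")

    with fluid.dygraph.guard():
        mlp_load = MLP('mlp')
        optimizer_load = SGDOptimizer(
            learning_rate=fluid.layers.natural_exp_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True))

        # load_persistables now returns two dicts: parameters and optimizers.
        parameters, optimizers = fluid.dygraph.load_persistables("save_dir_sketch")
        mlp_load.load_dict(parameters)
        optimizer_load.load(optimizers)

Because dygraph.guard() resets unique_name, the freshly constructed optimizer in the second guard receives the same generated name (e.g. SGDOptimizer_0) that keyed the pickled scheduler on disk, which is the key load() looks up in the returned optimizer dict.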