From 98acfe97ec8bfc952e4a105ba3dd35e6ab5b0dd1 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 9 Oct 2019 21:16:46 +0800 Subject: [PATCH] Polish English APIs' doc of several Optimizers (#20166) * polish minimize en doc * polish adam optimizer en doc * polish adamax optimizer en doc * polish adagrad and decayed adagrad optimizer en doc * polish model average en doc, test=develop, test=document_fix, test=document_preview * self review and further polishing doc * update API.spec, test=develop, test=document_fix * update fluid.data api in examples, test=develop, test=document_fix * update fluid.data inferface, test=develop, test=document_fix * replace -1 by none, test=document_fix --- paddle/fluid/API.spec | 70 +++--- python/paddle/fluid/optimizer.py | 402 +++++++++++++++++++++---------- 2 files changed, 311 insertions(+), 161 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 1163cd87fd8..c522e63fc12 100755 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -912,116 +912,116 @@ paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('do paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.SGDOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) 
+paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.MomentumOptimizer ('paddle.fluid.optimizer.MomentumOptimizer', ('document', 'a72bd02e5459e64596897d190413d449')) paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.MomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 
'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.AdagradOptimizer ('paddle.fluid.optimizer.AdagradOptimizer', ('document', 'a1d4f0682cde43ad34432b1338aadf04')) +paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) +paddle.fluid.optimizer.AdagradOptimizer ('paddle.fluid.optimizer.AdagradOptimizer', ('document', 'b6508a25326275d44e658dd73bcd5593')) paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, 
keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.AdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.AdamOptimizer ('paddle.fluid.optimizer.AdamOptimizer', ('document', '6fe871b955cab6e267422d5af666dafa')) +paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) +paddle.fluid.optimizer.AdamOptimizer ('paddle.fluid.optimizer.AdamOptimizer', ('document', '34e694895a702ba18a7b5ae618458217')) 
paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.AdamOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.AdamaxOptimizer ('paddle.fluid.optimizer.AdamaxOptimizer', ('document', '883fc4541214e8343d3a89711936e15d')) 
+paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) +paddle.fluid.optimizer.AdamaxOptimizer ('paddle.fluid.optimizer.AdamaxOptimizer', ('document', '515bae60aa82e7fbd1046f59e56549bb')) paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.AdamaxOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdamaxOptimizer.minimize 
(ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.DpsgdOptimizer ('paddle.fluid.optimizer.DpsgdOptimizer', ('document', '71113c30b66c0f4035b10ebd8af8c5ad')) paddle.fluid.optimizer.DpsgdOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'clip', 'batch_size', 'sigma'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DpsgdOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.DpsgdOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.DpsgdOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DpsgdOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.DpsgdOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DpsgdOptimizer.load 
(ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.DpsgdOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', 'e76838a8586bf2e58e6b5cdd2f67f780')) +paddle.fluid.optimizer.DpsgdOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) +paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', '6f5adb9f881a3b182236e344033dbd44')) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 
'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.FtrlOptimizer ('paddle.fluid.optimizer.FtrlOptimizer', ('document', 'cba8aae0a267b9a4d8833ae79a00fc55')) paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.FtrlOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 
'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.RMSPropOptimizer ('paddle.fluid.optimizer.RMSPropOptimizer', ('document', '5217bc4fc399010021d6b70541005780')) paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.RMSPropOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 
'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', 'f4354aef5e3b9134fa68919b75a3a097')) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 
'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.ModelAverage ('paddle.fluid.optimizer.ModelAverage', ('document', '0a0adcd60230630e21fe1ef46362dbc0')) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) +paddle.fluid.optimizer.ModelAverage ('paddle.fluid.optimizer.ModelAverage', ('document', 'e039a4b422ce5b360b4d777481d64975')) 
paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '648010d0ac1fa707dac0b89f74b0e35c')) +paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '582c279ec4792edf2d95a3064578da7b')) paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.ModelAverage.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ModelAverage.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.ModelAverage.minimize 
(ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '5f14ea4adda2791e1c3b37ff327f6a83')) +paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) +paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '7917cbe4d3ed7954ae73360fbccc39f6')) paddle.fluid.optimizer.LarsMomentumOptimizer ('paddle.fluid.optimizer.LarsMomentumOptimizer', ('document', '030b9092a96a409b1bf5446bf45d0659')) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.LarsMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LarsMomentumOptimizer.backward 
(ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.DGCMomentumOptimizer ('paddle.fluid.optimizer.DGCMomentumOptimizer', ('document', 'facdbef1b4871d0cf74c736ff2e94720')) paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), 
('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DGCMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.LambOptimizer ('paddle.fluid.optimizer.LambOptimizer', ('document', '7dd8b270156a52f1f6b4663336960893')) paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 'beta2', 'epsilon', 'regularization', 'exclude_from_weight_decay_fn', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LambOptimizer.apply_gradients 
(ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'd2a59fb4c678a2feb231fc5b1adcc9b4')) paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LambOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304')) paddle.fluid.optimizer.ExponentialMovingAverage ('paddle.fluid.optimizer.ExponentialMovingAverage', ('document', 'a38b7d5b9f17a295ed15d4c1b9ab4cd0')) paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 
'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a')) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9c0c29481e3..3e9c9431595 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -449,23 +449,28 @@ class Optimizer(object): no_grad_set=None, callbacks=None): """ - First part of `minimize`, do auto-diff to append backward ops for + The first part of ``minimize``, do auto-diff to append backward operations for the current program. Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - parameter_list (list): list of Variables to update. - no_grad_set (set|None): set of Variables should be ignored. - callbacks (list|None): list of callables to run when appending backward - operator for one parameter. + loss (Variable): ``loss`` variable to run optimizations. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameter_list``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameter_list (list, optional): List of ``Variable`` names to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Variable`` objects that don't need + to be updated. The default value is None. + callbacks (list, optional): list of callable objects to run when appending backward + operator for one parameter. The default value is None. Return: - list: list of (param, grad) pair, grad is the output of backward. 
+ list: list of (param, grad) variable pairs, param is ``Parameter``, + grad is the gradient value corresponding to the parameter. Examples: - See examples in `apply_gradients`. + See examples in ``apply_gradients``. """ no_grad_set = self._get_no_grad_set(loss, no_grad_set) @@ -597,22 +602,30 @@ class Optimizer(object): no_grad_set=None, grad_clip=None): """ - Add operations to minimize `loss` by updating `parameter_list`. - - This method combines interface `backward()` and - `apply_gradients()` into one. + Add operations to minimize ``loss`` by updating ``parameter_list``. Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - parameter_list (list): list of Variables to update. - no_grad_set (set|None): set of Variables should be ignored. - grad_clip (GradClipBase|None) : Gradient clip strategy + loss (Variable): A ``Variable`` containing the value to minimize. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameter_list``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameter_list (list, optional): List of ``Variable`` names to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Variable`` objects that don't need + to be updated. The default value is None. + grad_clip (GradClipBase, optional) : Gradient clipping strategy, static + graph mode does not need to use this argument. Currently, this argument + only supports gradient clipping in dygraph mode. In the future, this + argument may be adjusted. The default value is None. Returns: - tuple: (optimize_ops, params_grads) which are, list of operators appended; - and list of (param, grad) Variables pair for optimization.
+ tuple: tuple (optimize_ops, params_grads), A list of operators appended + by minimize and a list of (param, grad) variable pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + + Examples: + Please refer to the example of current Optimizer. """ assert isinstance(loss, Variable), "The loss should be an Variable." params_grads = self.backward( @@ -1173,9 +1186,10 @@ class LarsMomentumOptimizer(Optimizer): class AdagradOptimizer(Optimizer): """ - **Adaptive Gradient Algorithm (Adagrad)** + The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign + different learning rates to individual parameters. - The update is done as follows: + The parameter ``param_out`` update rule with gradient ``grad``: .. math:: @@ -1183,32 +1197,38 @@ class AdagradOptimizer(Optimizer): param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - does not have the epsilon attribute. It is added here in our implementation - as also proposed here: http://cs231n.github.io/neural-networks-3/#ada + Related paper: `Adaptive Subgradient Methods for Online Learning and + Stochastic Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_. + + The original paper does not have the ``epsilon`` attribute. It is added here + in our implementation as also proposed in `Per-parameter adaptive learning rate + methods <http://cs231n.github.io/neural-networks-3/#ada>`_ for numerical stability to avoid the division by zero error. Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - epsilon (float): a small float value for numerical stability. - regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. - initial_accumulator_value (float): Initial value for moment accumulator. + learning_rate (float|Variable): The learning rate used to update ``Parameter``.
+ It can be a float value or a ``Variable`` with a float type. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-06. + regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as + :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + initial_accumulator_value (float, optional): Initial value for moment accumulator. + The default value is 0.0. Examples: .. code-block:: python - import paddle.fluid as fluid import numpy as np + import paddle.fluid as fluid np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = fluid.layers.data( - name="inp", shape=[2, 2], append_batch_size=False) + inp = fluid.data(name="inp", shape=[2, 2]) out = fluid.layers.fc(inp, size=3) out = fluid.layers.reduce_sum(out) - optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.2) optimizer.minimize(out) exe = fluid.Executor(fluid.CPUPlace()) @@ -1276,12 +1296,12 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): """ - This implements the Adam optimizer from Section 2 of the Adam - paper : https://arxiv.org/abs/1412.6980. - Adam is a first-order gradient-based optimization method based on - adaptive estimates of lower-order moments. - - Adam updates: + The Adam optimizer uses an optimization described at the end + of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ , + it can dynamically adjust the learning rate of each parameter using + the 1st moment estimates and the 2nd moment estimates of the gradient. + + The parameter ``param_out`` update rule with gradient ``grad``: ..
math:: @@ -1296,20 +1316,29 @@ class AdamOptimizer(Optimizer): param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_ + Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - beta1 (float): The exponential decay rate for the 1st moment estimates. - beta2 (float): The exponential decay rate for the 2nd moment estimates. - epsilon (float): a small float value for numerical stability. - regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. - lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators - the accumulators are updated at every step. Every element of the two moving-average is updated - in both dense mode and sparse mode. If the size of parameter is very large, then the update - may be very slow. The lazy mode only update the element that has gradient is the current - mini-batch, so it will be much more faster. But this mode has different semantics with the - original Adam algorithm and may lead to different result. + learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as + :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. + name (str, optional): Normally there is no need for user to set this property.
+ For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only updates the element that has + gradient in current mini-batch, so it will be much faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. Examples: .. code-block:: python @@ -1320,8 +1349,8 @@ class AdamOptimizer(Optimizer): place = fluid.CPUPlace() main = fluid.Program() with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = fluid.layers.mean(cost) @@ -1457,11 +1486,12 @@ class AdamOptimizer(Optimizer): class AdamaxOptimizer(Optimizer): """ - We implement the Adamax optimizer from Section 7 of the Adam - paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the - Adam algorithm based on the infinity norm. + The Adamax optimizer is implemented based on the Adamax Optimization + in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_. + The Adamax algorithm is a variant of the Adam algorithm based on the infinity norm, + which makes the learning rate update algorithm more stable and simple. - Adamax updates: + The parameter ``param_out`` update rule with gradient ``grad``: ..
math:: @@ -1475,10 +1505,28 @@ class AdamaxOptimizer(Optimizer): param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_ + + The original paper does not have an ``epsilon`` attribute; + it is added here for numerical stability to prevent the division by 0 error. - The original paper does not have an epsilon attribute. - However, it is added here for numerical stability to prevent the - division by 0 error. + Args: + learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as + :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.** Examples: ..
code-block:: python @@ -1493,10 +1541,10 @@ class AdamaxOptimizer(Optimizer): train_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(train_program, startup_program): - data = fluid.layers.data(name='X', shape=[1], dtype='float32') + data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) loss = fluid.layers.mean(hidden) - adam = fluid.optimizer.Adamax(learning_rate=0.2) + adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2) adam.minimize(loss) # Run the startup program once and only once. @@ -1506,19 +1554,6 @@ class AdamaxOptimizer(Optimizer): outs = exe.run(program=train_program, feed={'X': x}, fetch_list=[loss.name]) - - Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - beta1 (float): The exponential decay rate for the 1st moment estimates. - beta2 (float): The exponential decay rate for the 2nd moment estimates. - epsilon (float): a small float value for numerical stability. - regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. - - Notes: - Currently, AdamaxOptimizer doesn't support sparse parameter optimization. """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" @@ -1690,11 +1725,11 @@ class DpsgdOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer): """ - **Decayed Adagrad Optimizer** - - The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces + the decay rate to solve the problem of a sharp drop in the learning rate + during model training when using the AdagradOptimizer. - The update is done as follows: + The parameter ``param_out`` update rule with gradient ``grad``: .. 
math:: @@ -1702,34 +1737,37 @@ class DecayedAdagradOptimizer(Optimizer): param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - does not have an epsilon attribute. It is added here for numerical + Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_. + + The original paper does not have an ``epsilon`` attribute. It is added here for numerical stability to avoid the division by zero error. Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - decay (float): decay rate. - epsilon (float): a small float value for numerical stability. - regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. + learning_rate (float|Variable): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. + decay (float, optional): The decay rate. The default value is 0.95. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-06. + regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as + :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.** Examples: ..
code-block:: python import paddle.fluid as fluid - import paddle.fluid.layers as layers - from paddle.fluid.optimizer import DecayedAdagrad - x = layers.data( name='x', shape=[-1, 10], dtype='float32' ) - trans = layers.fc( x, 100 ) - cost = layers.reduce_mean( trans ) - optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) + x = fluid.data( name='x', shape=[None, 10], dtype='float32' ) + trans = fluid.layers.fc( x, 100 ) + cost = fluid.layers.reduce_mean( trans ) + optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2) optimizer.minimize(cost) - - Notes: - Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization. """ _moment_acc_str = "moment" @@ -2359,21 +2397,45 @@ Lamb = LambOptimizer class ModelAverage(Optimizer): - """Accumulate the average of parameters within sliding window. The average - result will be saved in temporary variables which can be applied to - parameter variables of current model by calling 'apply()' method. And the - 'restore()' method is used to restore the parameter values of current model. + """ + The ModelAverage optimizer accumulates specific continuous historical parameters + during training. The accumulated historical range can be controlled by the passed + ``average_window_rate`` argument. The averaged ``Parameter`` are used in the prediction, + which usually can improve the accuracy of the prediction. + + Accumulate the average of the ``Parameter`` in the sliding window, the result will be saved + in a temporary variable, can be applied to the current model's ``Parameter`` by calling + the ``apply()`` method, and the current model ``Parameter`` can be restored by calling + the ``restore()`` method. + + The window size for calculating the average is determined by ``average_window_rate``, + ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates). 
- The size of average window is determined by average_window_rate, - min_average_window, max_average_window and current update times. + When the cumulative times (num_accumulates) is greater than the specific window + threshold (average_window), the accumulated ``Parameter`` temporary variable is set to 0.0. + The following example will help to understand the role of these arguments: + + :: + + if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate): + num_accumulates = 0 + + In the above conditional judgment statement, ``num_accumulates`` indicates the current + accumulated number, which can be abstractly understood as the length of the cumulative window. + The length of the window must be at least the length set by the ``min_average_window`` argument, + and cannot exceed the length specified by the ``max_average_window`` argument or + ``num_updates * average_window_rate``, where ``num_updates`` indicates the current ``Parameter`` + update times, ``average_window_rate`` is a coefficient that calculates the length of the window. Args: - average_window_rate: The rate of average window. - min_average_window: The minimum size of average window. - max_average_window: The maximum size of average window. - regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. + average_window_rate (float): The calculate ratio of the window length relative to ``Parameter`` update times. + min_average_window (int, optional): the minimum size of average window length. The default value is 10000. + max_average_window (int, optional): The maximum size of average window length. The default value is 10000. + regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as + :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. + name (str, optional): Normally there is no need for user to set this property. 
+ For more information, please refer to :ref:`api_guide_Name`. + The default value is None. Examples: @@ -2390,7 +2452,7 @@ class ModelAverage(Optimizer): startup_program = fluid.Program() with fluid.program_guard(train_program, startup_program): # build net - data = fluid.layers.data(name='X', shape=[1], dtype='float32') + data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) loss = fluid.layers.mean(hidden) optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) @@ -2399,13 +2461,14 @@ class ModelAverage(Optimizer): # build ModelAverage optimizer model_average = fluid.optimizer.ModelAverage(0.15, min_average_window=10000, - max_average_window=20000) + max_average_window=12500) exe.run(startup_program) - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) + for i in range(12500): + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) # apply ModelAverage with model_average.apply(exe): @@ -2526,11 +2589,54 @@ class ModelAverage(Optimizer): @signature_safe_contextmanager def apply(self, executor, need_restore=True): - """Apply average values to parameters of current model. + """ + Apply the average of the cumulative ``Parameter`` to the parameters of the current model. Args: - executor(fluid.Executor): current executor. - need_restore(bool): If you finally need to do restore, set it to True. Default is True. + executor(fluid.Executor): The current network executor. + need_restore(bool): Restore flag variable, if set to True, the network will restore + the parameters of the network to the default value, if set to False, + it will not be restored. The default value is True. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + # First create the Executor. 
+ place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # build net + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(loss) + + # build ModelAverage optimizer + model_average = fluid.optimizer.ModelAverage(0.15, + min_average_window=10000, + max_average_window=12500) + + exe.run(startup_program) + for i in range(12500): + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # apply ModelAverage + with model_average.apply(exe): + x = numpy.random.random(size=(10, 1)).astype('float32') + exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) """ executor.run(self.apply_program) try: @@ -2540,10 +2646,54 @@ class ModelAverage(Optimizer): self.restore(executor) def restore(self, executor): - """Restore parameter values of current model. + """ + Restore ``Parameter`` values of current model. Args: - executor(fluid.Executor): current executor. + executor(fluid.Executor): The current network executor. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + # First create the Executor. 
+ place = fluid.CPUPlace() # fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # build net + data = fluid.data(name='X', shape=[None, 1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(loss) + + # build ModelAverage optimizer + model_average = fluid.optimizer.ModelAverage(0.15, + min_average_window=10000, + max_average_window=12500) + + exe.run(startup_program) + for i in range(12500): + x = numpy.random.random(size=(10, 1)).astype('float32') + outs = exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # apply ModelAverage + with model_average.apply(exe, False): + x = numpy.random.random(size=(10, 1)).astype('float32') + exe.run(program=train_program, + feed={'X': x}, + fetch_list=[loss.name]) + + # restore Parameters + model_average.restore(exe) """ executor.run(self.restore_program) -- GitLab