From eb5d427d3940cd53500fc4003c66ad37ef1738db Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Dec 2018 11:37:39 +0800 Subject: [PATCH] add comment for lazy_mode adam optimizer --- python/paddle/fluid/optimizer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c53bf4913ad..59c22d4e498 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer): beta1 (float): The exponential decay rate for the 1st moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates. epsilon (float): a small float value for numerical stability. - regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. + regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. + lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators + the accumulators are updated at every step. Every element of the two moving-average is updated + in both dense mode and sparse mode. If the size of parameter is very large, then the update + may be very slow. The lazy mode only update the element that has gradient is the current + mini-batch, so it will be much more faster. But this mode has different semantics with the + original Adam algorithm and may lead to different result. Examples: .. code-block:: python -- GitLab