optimizers.py 13.5 KB
Newer Older
1
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
Z
zhangjinchao01 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.trainer.config_parser import Settings, default_decay_rate, \
    default_gradient_clipping_threshold, default_momentum

from .default_decorators import wrap_param_default

Q
qijun 已提交
20 21 22 23 24 25
__all__ = [
    'Optimizer', 'BaseSGDOptimizer', 'MomentumOptimizer', 'AdamaxOptimizer',
    'AdamOptimizer', 'AdaGradOptimizer', 'RMSPropOptimizer',
    'DecayedAdaGradOptimizer', 'AdaDeltaOptimizer', 'BaseRegularization',
    'L2Regularization', 'settings', 'ModelAverage'
]
Z
zhangjinchao01 已提交
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64


class Optimizer(object):
    def to_setting_kwargs(self):
        raise NotImplementedError()

    def extra_settings(self):
        pass

    @property
    def is_support_sparse(self):
        return True


class BaseSGDOptimizer(Optimizer):
    """
    SGD Optimizer.

    SGD is an optimization method, trying to find a neural network that
    minimize the "cost/error" of it by iteration. In paddle's implementation
    SGD Optimizer is synchronized, which means all gradients will be wait to
    calculate and reduced into one gradient, then do optimize operation.

    The neural network consider the learning problem of minimizing an objective
    function, that has the form of a sum

    ..  math::

        Q(w) = \\sum_{i}^{n} Q_i(w)

    The value of function Q sometimes is the cost of neural network (Mean
    Square Error between prediction and label for example). The function Q is
    parametrised by w, the weight/bias of neural network. And weights is what to
    be learned. The i is the i-th observation in (trainning) data.

    So, the SGD method will optimize the weight by

    ..  math::

L
luotao02 已提交
65
        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
Z
zhangjinchao01 已提交
66 67 68 69 70 71 72 73 74

    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
    """

    def to_setting_kwargs(self):
        raise NotImplementedError()


class MomentumOptimizer(BaseSGDOptimizer):
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
    """
    MomentumOptimizer.

    When sparse=True, the update scheme:

    ..  math::

        \\alpha_t &= \\alpha_{t-1} / k \\\\
        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
    
    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
    :math:`\\gamma_t` is learning rate at the t'th step.

    :param sparse: with sparse support or not.
    :type sparse: bool
    """
Q
qijun 已提交
94

Z
zhangjinchao01 已提交
95 96 97 98
    def extra_settings(self):
        default_momentum(self.momentum)

    def to_setting_kwargs(self):
99
        if self.sparse:
Q
qijun 已提交
100
            return {'learning_method': 'sparse_momentum'}
101
        else:
Q
qijun 已提交
102
            return {'learning_method': 'momentum'}
103 104

    def __init__(self, momentum=None, sparse=False):
Z
zhangjinchao01 已提交
105
        self.momentum = momentum
106
        self.sparse = sparse
Z
zhangjinchao01 已提交
107 108 109 110 111 112 113 114 115 116


class AdamOptimizer(BaseSGDOptimizer):
    """
    Adam optimizer.
    The details of please refer `Adam: A Method for Stochastic Optimization
    <https://arxiv.org/abs/1412.6980>`_

    ..  math::

L
luotao02 已提交
117 118 119
        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
Z
zhangjinchao01 已提交
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153

    :param beta1: the :math:`\\beta_1` in equation.
    :type beta1: float
    :param beta2: the :math:`\\beta_2` in equation.
    :type beta2: float
    :param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
                        divided by zero.
    :type epsilon: float
    """

    @property
    def is_support_sparse(self):
        return False

    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

    def to_setting_kwargs(self):
        return {
            'learning_method': 'adam',
            'adam_beta1': self.beta1,
            'adam_beta2': self.beta2,
            'adam_epsilon': self.epsilon
        }


class AdamaxOptimizer(BaseSGDOptimizer):
    """
    Adamax optimizer.

    The details of please refer this `Adam: A Method for Stochastic Optimization
    <https://arxiv.org/abs/1412.6980>`_
L
luotao02 已提交
154

Z
zhangjinchao01 已提交
155 156
    ..  math::

L
luotao02 已提交
157 158 159
        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
Z
zhangjinchao01 已提交
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193

    :param beta1: the :math:`\\beta_1` in the equation.
    :type beta1: float
    :param beta2: the :math:`\\beta_2` in the equation.
    :type beta2: float
    """

    def __init__(self, beta1, beta2):
        self.beta1 = beta1
        self.beta2 = beta2

    def to_setting_kwargs(self):
        return {
            'learning_method': 'adamax',
            'adam_beta1': self.beta1,
            'adam_beta2': self.beta2
        }

    @property
    def is_support_sparse(self):
        return False


class AdaGradOptimizer(BaseSGDOptimizer):
    """
    Adagrad(for ADAptive GRAdient algorithm) optimizer.

    For details please refer this `Adaptive Subgradient Methods for
    Online Learning and Stochastic Optimization
    <http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.

    ..  math::

        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
L
luotao02 已提交
194
        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
Z
zhangjinchao01 已提交
195 196 197
    """

    def to_setting_kwargs(self):
Q
qijun 已提交
198
        return {'learning_method': 'adagrad'}
Z
zhangjinchao01 已提交
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213

    def __init__(self):
        pass


class RMSPropOptimizer(BaseSGDOptimizer):
    """
    RMSProp(for Root Mean Square Propagation) optimizer. For details please
    refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
    lecture_slides_lec6.pdf>`_.

    The equations of this method as follows:

    ..  math::

L
luotao02 已提交
214 215
        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
Z
zhangjinchao01 已提交
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309

    :param rho: the :math:`\\rho` in the equation. The forgetting factor.
    :type rho: float
    :param epsilon: the :math:`\\epsilon` in the equation.
    :type epsilon: float
    """

    def to_setting_kwargs(self):
        return {
            'learning_method': 'rmsprop',
            'ada_rou': self.rho,
            'ada_epsilon': self.epsilon
        }

    def __init__(self, rho=0.95, epsilon=1e-6):
        self.rho = rho
        self.epsilon = epsilon


class DecayedAdaGradOptimizer(BaseSGDOptimizer):
    """
    AdaGrad method with decayed sum gradients. The equations of this method
    show as follow.

    ..  math::

        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
        learning\\_rate &= 1/sqrt( ( E(g_t^2) + \\epsilon )

    :param rho: The :math:`\\rho` parameter in that equation
    :type rho: float
    :param epsilon: The :math:`\\epsilon` parameter in that equation.
    :type epsilon: float
    """

    def to_setting_kwargs(self):
        return {
            'learning_method': 'decayed_adagrad',
            'ada_rou': self.rho,
            'ada_epsilon': self.epsilon
        }

    def __init__(self, rho=0.95, epsilon=1e-6):
        self.rho = rho
        self.epsilon = epsilon


class AdaDeltaOptimizer(BaseSGDOptimizer):
    """
    AdaDelta method. The details of adadelta please refer to this
    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.

    ..  math::

        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
                          E(g_t^2) + \\epsilon ) ) \\\\
        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2

    :param rho: :math:`\\rho` in equation
    :type rho: float
    :param epsilon: :math:`\\rho` in equation
    :type epsilon: float
    """

    def to_setting_kwargs(self):
        return {
            'learning_method': 'adadelta',
            'ada_rou': self.rho,
            'ada_epsilon': self.epsilon
        }

    def __init__(self, rho=0.95, epsilon=1e-6):
        self.rho = rho
        self.epsilon = epsilon


class BaseRegularization(Optimizer):
    def __init__(self):
        self.algorithm = ""
        self.learning_method = ""

    def to_setting_kwargs(self):
        return {}


class L2Regularization(BaseRegularization):
    def __init__(self, rate):
        super(L2Regularization, self).__init__()
        self.decay_rate = rate

    def to_setting_kwargs(self):
        if self.algorithm == 'owlqn':
Q
qijun 已提交
310
            return {'l2weight': self.decay_rate}
Z
zhangjinchao01 已提交
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
        else:
            return dict()

    def extra_settings(self):
        if self.algorithm == 'sgd' or self.algorithm == 'async_sgd':
            default_decay_rate(self.decay_rate)


class ModelAverage(Optimizer):
    def to_setting_kwargs(self):
        return {
            'average_window': self.average_window,
            'max_average_window': self.max_average_window,
            'do_average_in_cpu': self.do_average_in_cpu
        }

Q
qijun 已提交
327 328
    def __init__(self,
                 average_window,
Z
zhangjinchao01 已提交
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
                 max_average_window=None,
                 do_average_in_cpu=False):
        self.average_window = average_window
        self.max_average_window = max_average_window
        self.do_average_in_cpu = do_average_in_cpu


class GradientClippingThreshold(Optimizer):
    def extra_settings(self):
        default_gradient_clipping_threshold(self.threshold)

    def __init__(self, threshold):
        self.threshold = threshold

    def to_setting_kwargs(self):
        return dict()


def __extends__(dict1, dict2):
    for key in dict2:
        assert key not in dict1
        dict1[key] = dict2[key]
    return dict1


Q
qijun 已提交
354 355 356 357
@wrap_param_default(
    ['learning_method'], default_factory=lambda _: MomentumOptimizer())
@wrap_param_default(
    ['regularization'], default_factory=lambda _: BaseRegularization())
Z
zhangjinchao01 已提交
358 359
def settings(batch_size,
             learning_rate=1e-3,
E
emailweixu 已提交
360 361 362 363
             learning_rate_decay_a=0.,
             learning_rate_decay_b=0.,
             learning_rate_schedule='poly',
             learning_rate_args='',
Z
zhangjinchao01 已提交
364 365 366 367
             learning_method=None,
             regularization=None,
             is_async=False,
             model_average=None,
Q
qijun 已提交
368
             gradient_clipping_threshold=None):
Z
zhangjinchao01 已提交
369
    """
Y
yuyang18 已提交
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
    Set the optimization method, learning rate, batch size, and other training
    settings. The currently supported algorithms are SGD and Async-SGD.

    ..  warning::

        Note that the 'batch_size' in PaddlePaddle is not equal to global
        training batch size. It represents the single training process's batch
        size. If you use N processes to train one model, for example use three
        GPU machines, the global batch size is N*'batch_size'.

    :param batch_size: batch size for one training process.
    :type batch_size: int
    :param learning_rate: learning rate for SGD
    :type learning_rate: float
    :param learning_method: The extension optimization algorithms of gradient
                            descent, such as momentum, adagrad, rmsprop, etc.
                            Note that it should be instance with base type
                            BaseSGDOptimizer.
    :type learning_method: BaseSGDOptimizer
    :param regularization: The regularization method.
    :type regularization: BaseRegularization
    :param is_async: Is Async-SGD or not. Default value is False.
    :type is_async: bool
    :param model_average: Model Average Settings.
    :type model_average: ModelAverage
    :param gradient_clipping_threshold: gradient clipping threshold. If gradient
                                        value larger than some value, will be
                                        clipped.
    :type gradient_clipping_threshold: float
Z
zhangjinchao01 已提交
399 400 401 402 403 404 405 406 407 408
    """
    if isinstance(regularization, BaseRegularization):
        regularization = [regularization]

    assert isinstance(learning_method, Optimizer)
    if isinstance(learning_method, BaseSGDOptimizer):
        algorithm = 'async_sgd' if is_async else 'sgd'
    else:
        algorithm = 'owlqn'

Q
qijun 已提交
409 410
    args = [
        'batch_size', 'learning_rate', 'learning_rate_decay_a',
P
Peng Li 已提交
411
        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
Q
qijun 已提交
412
    ]
Z
zhangjinchao01 已提交
413 414
    kwargs = dict()
    kwargs['algorithm'] = algorithm
E
emailweixu 已提交
415 416
    for arg in args:
        kwargs[arg] = locals()[arg]
Z
zhangjinchao01 已提交
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441

    kwargs = __extends__(kwargs, learning_method.to_setting_kwargs())
    learning_method.extra_settings()

    for regular in regularization:
        assert isinstance(regular, BaseRegularization)
        regular.algorithm = algorithm
        regular.learning_method = kwargs['learning_method']
        kwargs = __extends__(kwargs, regular.to_setting_kwargs())
        regular.extra_settings()

    if gradient_clipping_threshold is not None:
        gradient_clipping_threshold = GradientClippingThreshold(
            threshold=gradient_clipping_threshold)

    for each in [model_average, gradient_clipping_threshold]:
        if each is not None:
            assert isinstance(each, Optimizer)
            each.algorithm = algorithm
            each.learning_method = kwargs['learning_method']
            kwargs = __extends__(kwargs, each.to_setting_kwargs())
            each.extra_settings()

    # Do Check?
    Settings(**kwargs)