diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
index e799e454f21432c0bd2b121152c137e5be7d1e8a..d6aa6625650861d97b965cc39e24f31c5b83051d 100644
--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -4,7 +4,7 @@
 
 ## 安装(Install)
 
-首先请参考安装教程安装PaddlePaddle。
+首先请参考安装教程安装PaddlePaddle。
 
 ## 使用概述(Overview)
 
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index f0e51c3de535e8830dae2ba1ac7b9d7caa53e118..ed676ac2152a6c08b90c18480ef0c69d5c0779f8 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -64,14 +64,6 @@ class BaseSGDOptimizer(Optimizer):
         w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
 
     where :math:`\\eta` is learning rate. And :math:`n` is batch size.
-
-    The SGD method is implemented by paddle with multiple extensions. Such as
-    momentum, adagrad, rmsprop, adam. Please use method 'use_xxx', such as
-    use_adam, to enhance the SGD method.
-
-    WARNING: IN PADDLE'S IMPLEMENTATION, BATCH_SIZE IS SET FOR ONE COMPUTE
-    PROCESS(NODE). IF YOU USE MULTIPLE MACHINE TO TRAIN YOUR NETWORK, THE GLOBAL
-    BATCH SIZE WILL BE (BATCH_SIZE * MACHINE_COUNT).
     """
 
     def to_setting_kwargs(self):
@@ -352,17 +344,35 @@ def settings(batch_size,
              gradient_clipping_threshold=None
              ):
     """
-    TODO(yuyang18): Complete docs.
-
-
-    :param batch_size:
-    :param learning_rate:
-    :param learning_method:
-    :param regularization:
-    :param is_async:
-    :param model_average:
-    :param gradient_clipping_threshold:
-    :return:
+    Set the optimization method, learning rate, batch size, and other training
+    settings. The currently supported algorithms are SGD and Async-SGD.
+
+    .. warning::
+
+        Note that the 'batch_size' in PaddlePaddle is not equal to the global
+        training batch size. It is the batch size of a single training process.
+        If you use N processes to train one model, for example on three GPU
+        machines, the global batch size is N * 'batch_size'.
+
+    :param batch_size: batch size for one training process.
+    :type batch_size: int
+    :param learning_rate: learning rate for SGD.
+    :type learning_rate: float
+    :param learning_method: The extension optimization algorithms of gradient
+                            descent, such as momentum, adagrad, rmsprop, etc.
+                            Note that it should be an instance of a subclass of
+                            BaseSGDOptimizer.
+    :type learning_method: BaseSGDOptimizer
+    :param regularization: The regularization method.
+    :type regularization: BaseRegularization
+    :param is_async: Is Async-SGD or not. Default value is False.
+    :type is_async: bool
+    :param model_average: Model averaging settings.
+    :type model_average: ModelAverage
+    :param gradient_clipping_threshold: gradient clipping threshold. If a
+                                        gradient value is larger than this
+                                        threshold, it will be clipped.
+    :type gradient_clipping_threshold: float
     """
     if isinstance(regularization, BaseRegularization):
         regularization = [regularization]
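
For readers skimming this patch, the sketch below shows how the newly documented `settings()` call is typically used from a trainer configuration. It only illustrates the documented parameters: the helper names `AdamOptimizer` and `L2Regularization` are assumptions standing in for whatever `BaseSGDOptimizer` / `BaseRegularization` subclasses `trainer_config_helpers` provides; they are not part of this diff.

```python
# Minimal sketch of a trainer config using settings() as documented above.
# AdamOptimizer and L2Regularization are assumed stand-ins for concrete
# BaseSGDOptimizer / BaseRegularization subclasses exported by the package.
from paddle.trainer_config_helpers import *

settings(
    batch_size=128,                          # per-process batch size, not global
    learning_rate=1e-3,
    learning_method=AdamOptimizer(),         # a BaseSGDOptimizer subclass instance
    regularization=L2Regularization(8e-4),   # a BaseRegularization subclass instance
    gradient_clipping_threshold=25,
)
```

With the per-process semantics described in the new warning, running this config on four trainer processes would give a global batch size of 4 * 128 = 512.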