Unverified commit f66d08c2, authored by Xin Pan, committed by GitHub

Merge pull request #13493 from panyx0718/doc

convert **kwargs to explicit arguments
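This change removes the `**kwargs` catch-all from every optimizer constructor in `paddle.fluid.optimizer`, exposing `regularization` and `name` as explicit keyword arguments, and drops the experimental LARS hooks (`--use_lars`, `LARS_weight_decay`). A minimal sketch of the calling convention after the change; the values and the name prefix below are illustrative, not taken from the patch:

    import paddle.fluid as fluid

    # Explicit keyword arguments replace the old **kwargs pass-through.
    optimizer = fluid.optimizer.MomentumOptimizer(
        learning_rate=0.1,                                # illustrative value
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4),   # optional, default None
        name="momentum_opt")                              # optional name prefix
    # The removed LARS option is no longer accepted:
    #   fluid.optimizer.MomentumOptimizer(..., LARS_weight_decay=0.3)  -> TypeError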
@@ -136,10 +136,6 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
-    parser.add_argument(
-        '--use_lars',
-        action='store_true',
-        help='If set, use lars for optimizers, ONLY support resnet module.')
     parser.add_argument(
         '--reduce_strategy',
         type=str,
...
@@ -200,11 +200,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     # configure optimize
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
         total_images = 1281167 / trainer_count
         step = int(total_images / (args.batch_size * args.gpus) + 1)
...
@@ -224,11 +224,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     # configure optimize
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
         total_images = 1281167 / trainer_count
         step = int(total_images / args.batch_size + 1)
...
@@ -244,11 +244,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
         total_images = 1281167 / trainer_count
         step = int(total_images / args.batch_size + 1)
@@ -262,8 +257,7 @@ def get_model(args, is_train, main_prog, startup_prog):
         learning_rate=fluid.layers.piecewise_decay(
             boundaries=bd, values=lr),
         momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4),
-        LARS_weight_decay=lars_decay)
+        regularization=fluid.regularizer.L2Decay(1e-4))
     optimizer.minimize(avg_cost)
     if args.memory_optimize:
...
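In the benchmark models above, the optimizer is now built without any LARS-related arguments. A hedged sketch of the resulting setup; the optimizer class is assumed to be Momentum, and `bd`, `lr`, and `avg_cost` stand in for values computed elsewhere in `get_model`:

    import paddle.fluid as fluid

    bd = [3000, 6000, 9000]          # placeholder step boundaries
    lr = [0.1, 0.01, 0.001, 0.0001]  # placeholder learning-rate schedule

    # Assumed to mirror the call shown in the hunk above.
    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
    # optimizer.minimize(avg_cost)  # avg_cost is the loss built in get_model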
@@ -350,25 +350,25 @@ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'fi
 paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
-paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
+paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
+paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
+paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
+paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
...
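The updated API.spec entries reflect that the optimizer signatures are now fully introspectable: `keywords='kwargs'` disappears and `regularization`/`name` show up as real parameters. A small hedged illustration (the entries above look like `inspect.getargspec` output, so that is what is used here; the coefficient and name are arbitrary):

    import inspect
    import paddle.fluid as fluid

    # The signature now lists every accepted argument explicitly.
    print(inspect.getargspec(fluid.optimizer.AdamOptimizer.__init__))
    # Expected to match the + line above: args include 'regularization' and
    # 'name', and keywords is None instead of 'kwargs'.

    adam = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001,
        regularization=fluid.regularizer.L2Decay(1e-4),  # arbitrary coefficient
        name="adam_opt")                                  # arbitrary prefix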
@@ -43,11 +43,7 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """

-    def __init__(self,
-                 learning_rate,
-                 regularization=None,
-                 LARS_weight_decay=0.0,
-                 name=None):
+    def __init__(self, learning_rate, regularization=None, name=None):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
@@ -68,7 +64,6 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
-        self._LARS_weight_decay = LARS_weight_decay

     def _create_global_learning_rate(self):
         lr = self._global_learning_rate()
@@ -109,7 +104,6 @@ class Optimizer(object):
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
         if type(param_lr) == Variable:
-            # param learning rate has been updated (LARS)
             print("returns updated param lr ", param_lr)
             return param_lr
         else:
@@ -227,10 +221,6 @@ class Optimizer(object):
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
         self._create_global_learning_rate()
-        if self._LARS_weight_decay > 0.0:
-            layers.append_LARS(parameters_and_grads,
-                               self._global_learning_rate(),
-                               self._LARS_weight_decay)

         optimize_ops = []
         for param_and_grad in parameters_and_grads:
@@ -287,6 +277,9 @@ class SGDOptimizer(Optimizer):
     Args:
         learning_rate (float|Variable): the learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -295,10 +288,12 @@ class SGDOptimizer(Optimizer):
             sgd_optimizer.minimize(cost)
     """

-    def __init__(self, learning_rate, **kwargs):
+    def __init__(self, learning_rate, regularization=None, name=None):
         assert learning_rate is not None
         super(SGDOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "sgd"

     def _append_optimize_op(self, block, param_and_grad):
@@ -343,6 +338,9 @@ class MomentumOptimizer(Optimizer):
             Can be a float value or a Variable with one float value as data element.
         momentum (float): momentum factor
         use_nesterov (bool): enables Nesterov momentum
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -352,11 +350,18 @@ class MomentumOptimizer(Optimizer):
     """
     _velocity_acc_str = "velocity"

-    def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 use_nesterov=False,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert momentum is not None
         super(MomentumOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
@@ -412,6 +417,9 @@ class AdagradOptimizer(Optimizer):
         learning_rate (float|Variable): the learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -421,11 +429,17 @@ class AdagradOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"

-    def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 epsilon=1.0e-6,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert epsilon is not None
         super(AdagradOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
@@ -485,6 +499,9 @@ class AdamOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -503,13 +520,16 @@ class AdamOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
         super(AdamOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adam"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -629,6 +649,9 @@ class AdamaxOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -645,13 +668,16 @@ class AdamaxOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
         super(AdamaxOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adamax"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -742,6 +768,9 @@ class DecayedAdagradOptimizer(Optimizer):
             Can be a float value or a Variable with one float value as data element.
         decay (float): decay rate.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -751,13 +780,20 @@ class DecayedAdagradOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"

-    def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 decay=0.95,
+                 epsilon=1.0e-6,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert decay is not None
         assert epsilon is not None
         super(DecayedAdagradOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "decayed_adagrad"
         self._decay = decay
         self._epsilon = epsilon
@@ -811,6 +847,9 @@ class AdadeltaOptimizer(Optimizer):
         learning_rate(float): global learning rate
         rho(float): rho in equation
         epsilon(float): epsilon in equation
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -823,7 +862,12 @@ class AdadeltaOptimizer(Optimizer):
     _avg_squared_grad_acc_str = "_avg_squared_grad"
     _avg_squared_update_acc_str = "_avg_squared_update"

-    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 epsilon=1.0e-6,
+                 rho=0.95,
+                 regularization=None,
+                 name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if epsilon is None:
@@ -831,7 +875,9 @@ class AdadeltaOptimizer(Optimizer):
         if rho is None:
             raise ValueError("rho is not set.")
         super(AdadeltaOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho
@@ -932,6 +978,9 @@ class RMSPropOptimizer(Optimizer):
             the gradient; if False, by the uncentered second moment. Setting this to
             True may help with training, but is slightly more expensive in terms of
             computation and memory. Defaults to False.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
@@ -953,9 +1002,12 @@ class RMSPropOptimizer(Optimizer):
                  epsilon=1.0e-6,
                  momentum=0.0,
                  centered=False,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         super(RMSPropOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if rho is None:
@@ -1061,6 +1113,9 @@ class FtrlOptimizer(Optimizer):
         l1 (float):
         l2 (float):
         lr_power (float):
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
@@ -1075,9 +1130,17 @@ class FtrlOptimizer(Optimizer):
     _squared_acc_str = "squared"
     _linear_acc_str = "linear"

-    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 l1=0.0,
+                 l2=0.0,
+                 lr_power=-0.5,
+                 regularization=None,
+                 name=None):
         super(FtrlOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
@@ -1155,7 +1218,9 @@ class ModelAverage(Optimizer):
         average_window_rate: The rate of average window.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python
@@ -1178,8 +1243,10 @@ class ModelAverage(Optimizer):
                  average_window_rate,
                  min_average_window=10000,
                  max_average_window=10000,
-                 **kwargs):
-        super(ModelAverage, self).__init__(0.0, **kwargs)
+                 regularization=None,
+                 name=None):
+        super(ModelAverage, self).__init__(
+            0.0, regularization=regularization, name=name)
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
...
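For user code, the practical consequence of the optimizer.py changes is that the previously accepted LARS argument now fails loudly, while regularization continues to work through the explicit keyword. A hedged migration sketch (the error text in the comment is typical CPython wording, not taken from the patch):

    import paddle.fluid as fluid

    try:
        # Accepted before this commit via the base Optimizer's LARS support.
        fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
    except TypeError as e:
        # e.g. "__init__() got an unexpected keyword argument 'LARS_weight_decay'"
        print("LARS_weight_decay is gone:", e)

    # Explicit replacement: keep the regularizer, drop LARS.
    optimizer = fluid.optimizer.Adam(
        learning_rate=0.001,
        regularization=fluid.regularizer.L2Decay(1e-4))  # illustrative coefficient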
@@ -190,14 +190,11 @@ class L1DecayRegularizer(WeightDecayRegularizer):
     Examples:
         .. code-block:: python

-            program = fluid.framework.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=1e-4,
+                regularization=fluid.regularizer.L1DecayRegularizer(
+                    regularization_coeff=0.1))
+            optimizer.minimize(avg_cost)
     """

     def __init__(self, regularization_coeff=0.0):
...
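The new docstring example assumes an `avg_cost` variable from the surrounding program. A self-contained hedged version, with a tiny illustrative network that is not part of the patch:

    import paddle.fluid as fluid

    # Minimal network so the snippet above has an avg_cost to minimize.
    x = fluid.layers.data(name="x", shape=[13], dtype="float32")
    y = fluid.layers.data(name="y", shape=[1], dtype="float32")
    y_predict = fluid.layers.fc(input=x, size=1)
    avg_cost = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))

    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L1DecayRegularizer(
            regularization_coeff=0.1))
    optimizer.minimize(avg_cost)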
@@ -99,7 +99,7 @@ def train(nn_type,
     test_program = fluid.default_main_program().clone(for_test=True)

-    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
     optimizer.minimize(avg_loss)

     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
...