Unverified commit 723c6f77, authored by LoneRanger, committed by GitHub

Remove AdamOptimizer, SGDOptimizer, MomentumOptimizer, ModelAverage, LookaheadOptimizer, FtrlOptimizer, DecayedAdagradOptimizer, and DpsgdOptimizer from fluid; relocate ExponentialMovingAverage, PipelineOptimizer, and GradientMergeOptimizer; and change the optimizer base for LarsMomentumOptimizer and RecomputeOptimizer (#55970)
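Most of the removed fluid optimizers have `paddle.optimizer` (or `paddle.incubate.optimizer`) counterparts; as the hunks below show, their constructors take `parameters` instead of `parameter_list` and `weight_decay` instead of `regularization`. A minimal before/after sketch of the migration (the toy model and training step are illustrative, not taken from this PR):

    import paddle

    model = paddle.nn.Linear(10, 1)  # hypothetical toy model

    # Before (removed in this PR): fluid optimizers took `parameter_list`
    # opt = paddle.fluid.optimizer.SGDOptimizer(
    #     learning_rate=0.01, parameter_list=model.parameters())

    # After: the paddle.optimizer API takes `parameters`
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

    loss = paddle.mean(model(paddle.rand([4, 10])))
    loss.backward()
    opt.step()
    opt.clear_grad()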

* change the optimizer base for SGDOptimizer

* change the optimizer base for SGDOptimizer

* replace the SGDOptimizer with SGD

* fix bug of sgd

* change the optimizer base for MomentumOptimizer

* fix the remaining tests

* remove the Momentum in fluid/optimizer.py

* fix bug

* fix bug

* fix bug

* fix bug

* Update test_resnet_cinn.py

* Update test_resnet_prim_cinn.py

* fix bug

* fix bug

* fix bug

* remove the ModelAverage in fluid

* remove the LookaheadOptimizer in fluid

* fix bug

* remove AdamOptimizer in fluid

* Update test_image_classification_fp16.py

* fix bug

* relocate the ExponentialMovingAverage in fluid

* restore the static api

* remove the FtrlOptimizer in fluid

* remove the DecayedAdagradOptimizer in fluid

* remove the DpsgdOptimizer in fluid

* fix bug

* fix codestyle

* fix bug

* fix bug

* relocate the PipelineOptimizer

* relocate the GradientMergeOptimizer

* fix bug

* fix bug

* fix bug

* fix doc

* Update __init__.py

* Update test_fleet_qat_meta_optimizer.py

* change optimizer base for LarsMomentumOptimizer

* fix bug

* fix conflict

* fix code-style

* fix sample codes

* fix bug

* fix bug

* fix cinn bug

* fix bug

* fix bug

* Update qat_optimizer.py

* Update __init__.py

* fix bug

* change optimizer base for RecomputeOptimizer

* fix bug

* fix bug

* Update test_imperative_optimizer_v2.py
Parent 9ec0bdd5
......@@ -75,8 +75,8 @@ class AmpScaler:
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
model = paddle.nn.Conv2D(3, 2, 3)
optimizer = paddle.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
optimizer = paddle.optimizer.SGD(
learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
data = paddle.to_tensor(data)
with paddle.amp.amp_guard():
......@@ -168,8 +168,8 @@ class AmpScaler:
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
model = paddle.nn.Conv2D(3, 2, 3)
optimizer = paddle.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
optimizer = paddle.optimizer.SGD(
learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
data = paddle.to_tensor(data)
with paddle.amp.amp_guard():
......@@ -221,8 +221,8 @@ class AmpScaler:
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
model = paddle.nn.Conv2D(3, 2, 3)
optimizer = paddle.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
optimizer = paddle.optimizer.SGD(
learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
data = paddle.to_tensor(data)
with paddle.amp.amp_guard():
......
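For context, the three truncated docstring hunks above all converge on the same updated AmpScaler pattern; a sketch of the full example is shown below, where the body of the `amp_guard()` block is assumed from the usual AmpScaler usage rather than shown in the hunks:

    import numpy as np
    import paddle

    data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
    model = paddle.nn.Conv2D(3, 2, 3)
    optimizer = paddle.optimizer.SGD(
        learning_rate=0.01, parameters=model.parameters())
    scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
    data = paddle.to_tensor(data)
    with paddle.amp.amp_guard():
        conv = model(data)
        loss = paddle.mean(conv)
        scaled = scaler.scale(loss)          # scale the loss
        scaled.backward()                    # run backward on the scaled loss
        scaler.minimize(optimizer, scaled)   # unscale gradients and apply the update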
......@@ -21,9 +21,10 @@ __all__ = []
import paddle
from paddle.common_ops_import import LayerHelper
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.optimizer import Momentum, Optimizer
from paddle.fluid.optimizer import Optimizer
from paddle.framework import core, in_dynamic_mode
from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
from paddle.optimizer import Momentum
from paddle.regularizer import L1Decay, L2Decay
from paddle.static import create_global_var
......
......@@ -152,6 +152,6 @@ class FP16AllReduceOptimizer(MetaOptimizerBase):
def apply_optimize(self, loss, startup_program, params_grads):
new_params_grads = self.fp16_compression(params_grads)
return self.inner_opt.apply_optimize(
return self.inner_opt._apply_optimize(
loss, startup_program=startup_program, params_grads=new_params_grads
)
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from paddle.fluid.optimizer import GradientMergeOptimizer as GM
from paddle.incubate.optimizer import GradientMergeOptimizer as GM
from .meta_optimizer_base import MetaOptimizerBase
......
......@@ -14,7 +14,7 @@
import logging
import paddle
from paddle.fluid.optimizer import AdamOptimizer
from paddle.optimizer import Adam
from .meta_optimizer_base import MetaOptimizerBase
......@@ -38,7 +38,7 @@ class LambOptimizer(MetaOptimizerBase):
)
opt = self.inner_opt
if not isinstance(opt, AdamOptimizer):
if not isinstance(opt, Adam):
return
configs = self.user_defined_strategy.lamb_configs
......@@ -72,7 +72,7 @@ class LambOptimizer(MetaOptimizerBase):
return False
if self.user_defined_strategy.lamb:
if not isinstance(self.inner_opt, AdamOptimizer):
if not isinstance(self.inner_opt, Adam):
logging.warn(
"lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".format(
self.inner_opt.type
......
......@@ -13,7 +13,8 @@
import logging
from paddle.fluid.optimizer import LarsMomentumOptimizer, Momentum
from paddle.incubate.optimizer import LarsMomentumOptimizer
from paddle.optimizer import Momentum
from .meta_optimizer_base import MetaOptimizerBase
......@@ -98,7 +99,7 @@ class LarsOptimizer(MetaOptimizerBase):
return self.lars_opt.apply_gradients(params_grads=params_grads)
def apply_optimize(self, loss, startup_program, params_grads):
return self.lars_opt.apply_optimize(
return self.lars_opt._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
......
......@@ -49,9 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
self.inner_opt,
(
paddle.optimizer.momentum.Momentum,
paddle.fluid.optimizer.Momentum,
paddle.optimizer.sgd.SGD,
paddle.fluid.optimizer.SGD,
),
)
......@@ -235,9 +233,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
self.inner_opt,
(
paddle.optimizer.Momentum,
paddle.fluid.optimizer.Momentum,
paddle.optimizer.sgd.SGD,
paddle.fluid.optimizer.SGD,
),
)
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Optimizer
from paddle.optimizer import Optimizer
__all__ = []
......@@ -81,7 +81,7 @@ class MetaOptimizerBase(Optimizer):
)
def apply_optimize(self, loss, startup_program, params_grads):
return self.inner_opt.apply_optimize(
return self.inner_opt._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
......
......@@ -17,7 +17,6 @@ import re
import subprocess
import paddle
from paddle import fluid
from paddle.framework import core
from ..base.private_helper_function import wait_server_ready
......@@ -293,7 +292,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
% (platform.system())
)
if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer):
if not isinstance(self.inner_opt, paddle.optimizer.SGD):
return False
free = get_sys_free_mem()
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
import paddle
from paddle.fluid.optimizer import PipelineOptimizer as PO
from paddle.incubate.optimizer import PipelineOptimizer as PO
from .common import (
OP_ROLE_KEY,
......
......@@ -204,7 +204,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
% (platform.system())
)
if not isinstance(self.inner_opt, paddle.fluid.optimizer.SGDOptimizer):
if not isinstance(self.inner_opt, paddle.optimizer.SGD):
return False
free = get_sys_free_mem()
......
......@@ -96,9 +96,9 @@ class QATOptimizer(MetaOptimizerBase):
):
optimize_ops, params_grads = self.inner_opt.minimize(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set,
startup_program,
parameter_list,
no_grad_set,
)
device = paddle.device.get_device()
place = paddle.set_device(device)
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from paddle.fluid.optimizer import RecomputeOptimizer as RO
from paddle.incubate.optimizer import RecomputeOptimizer as RO
from .meta_optimizer_base import MetaOptimizerBase
......
......@@ -15,7 +15,7 @@
import os
from paddle.fluid import core
from paddle.fluid.optimizer import PipelineOptimizer
from paddle.incubate.optimizer import PipelineOptimizer
from paddle.static import (
create_global_var,
default_startup_program,
......
......@@ -917,7 +917,7 @@ class FP16Pass(AMPPass):
if self.target_dtype == "fp16":
if isinstance(
base_opt, (paddle.static.Adam, paddle.optimizer.AdamW)
base_opt, (paddle.optimizer.Adam, paddle.optimizer.AdamW)
):
with main_program._optimized_guard([]):
# found_inf = paddle.tensor.creation._memcpy(
......
......@@ -295,7 +295,7 @@ class DistributeTranspiler:
cost =paddle.nn.functional.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
# for pserver mode
......
This diff is collapsed.
......@@ -17,8 +17,7 @@ import abc
from paddle import fluid
from paddle.distributed.fleet.base.role_maker import RoleMakerBase
from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGD
from paddle.optimizer import SGD as SGD_v2
from paddle.optimizer import SGD
from paddle.static.amp.decorator import OptimizerWithMixedPrecision
__all__ = []
......@@ -293,8 +292,8 @@ class DistributedOptimizer(metaclass=abc.ABCMeta):
def __init__(self, optimizer, strategy=None):
if (
not isinstance(optimizer, SGD.__bases__)
and not isinstance(optimizer, fluid.optimizer.Optimizer)
and not isinstance(optimizer, OptimizerWithMixedPrecision)
and not isinstance(optimizer, SGD_v2.__base__)
):
raise TypeError("optimizer must be an instance of Optimizer")
......
......@@ -533,7 +533,7 @@ class CollectiveOptimizer(DistributedOptimizer):
"forward_recompute", self._optimizer.__class__.__name__
)
self._optimizer = fluid.optimizer.RecomputeOptimizer(
self._optimizer = paddle.incubate.optimizer.RecomputeOptimizer(
self._optimizer
)
self._optimizer._set_checkpoints(self._recompute_checkpoints)
......
......@@ -367,7 +367,7 @@ class FleetTranspiler(Fleet):
TranspilerOptimizer: subclass of DistributedOptimizer.
"""
if not isinstance(optimizer, Optimizer):
if not isinstance(optimizer, paddle.optimizer.Optimizer):
raise ValueError("optimizer must be an instance of Optimizer")
if not self._is_initialized:
raise ValueError(
......
......@@ -14,6 +14,10 @@
from .lookahead import LookAhead # noqa: F401
from .modelaverage import ModelAverage # noqa: F401
from .lars_momentum import LarsMomentumOptimizer # noqa: F401
from .recompute import RecomputeOptimizer # noqa: F401
from .pipeline import PipelineOptimizer # noqa: F401
from .gradient_merge import GradientMergeOptimizer # noqa: F401
from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401
from .lbfgs import LBFGS # noqa: F401
from . import functional # noqa: F401
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core
from paddle.fluid.framework import (
Variable,
default_main_program,
default_startup_program,
device_guard,
in_dygraph_mode,
program_guard,
)
__all__ = []
class GradientMergeOptimizer:
"""
Gradient Merge, also called Gradient Accumulation,
is a training strategy for larger batches. With this strategy,
the parameters are not updated until a specified number of steps.
At each step, the forward and backward networks
run to calculate the gradients of the parameters.
Every k steps, the optimization network runs,
applying a specific optimization method (such as SGD or Adam)
to the parameters.
Args:
inner_optimizer (Optimizer): The specific optimizer (such as SGD or Adam)
which updates the parameters.
k_steps (int): The update period of the parameters.
avg (bool): Whether to average the gradients of each mini-batch.
The default value is `True`.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
import numpy as np
def gen_data(batch_size):
return {"x": np.random.random(size=(batch_size, 32)).astype('float32'),
"y": np.random.random(size=(batch_size, 1)).astype('int64')}
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
cost = paddle.nn.functional.cross_entropy(
input=prediction, label=input_y,
reduction='none', use_softmax=False
)
sum_cost = paddle.mean(cost)
return sum_cost, fc_1, prediction
input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
cost, fc_1, pred = mlp(input_x, input_y)
sgd = paddle.optimizer.Adam(learning_rate=0.01)
sgd = paddle.incubate.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True)
sgd.minimize(cost)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for i in range(10):
cost_val = exe.run(feed=gen_data(32),
program=fluid.default_main_program(),
fetch_list=[cost.name])
print("step=%d, cost=%f" % (i, cost_val[0]))
"""
GRAD_MERGE_COND_NAME = "grad_merge_cond_name"
def __init__(self, inner_optimizer, k_steps=1, avg=True):
if in_dygraph_mode():
raise Exception(
"In dygraph, we don't support GradientMergeOptimizer."
"You can do Gradient merge by yourself with k-times forward + backward, "
"and one-time optimizer.minimize()"
)
assert inner_optimizer is not None, "inner optimizer can not be None"
assert (
isinstance(k_steps, int) and k_steps > 0
), "k_steps should be a positive integer"
self.inner_optimizer = inner_optimizer
self.k_steps = k_steps
self.type = "gradient_merge"
self.avg = avg
self._optimize_ops = None
def _set_k_steps(self, k_steps):
self.k_steps = k_steps
def _set_avg(self, avg):
self.avg = avg
def backward(
self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
callbacks=None,
):
assert isinstance(loss, Variable), "The loss should be a Variable."
assert (
parameter_list is None
), "The parameter_list should be None when using GradientMergeOptimizer"
assert (
no_grad_set is None
), "The no_grad_set should be None when using GradientMergeOptimizer"
params_grads = self.inner_optimizer.backward(
loss, startup_program=startup_program
)
return params_grads
def apply_optimize(self, loss, startup_program, params_grads):
program = loss.block.program
with program_guard(program, startup_program):
optimize_ops = self.apply_gradients(params_grads)
return optimize_ops
def _is_the_backward_op(self, op):
op_maker = core.op_proto_and_checker_maker
backward = core.op_proto_and_checker_maker.OpRole.Backward
if op_maker.kOpRoleVarAttrName() in op.attr_names and int(
op.all_attrs()[op_maker.kOpRoleAttrName()]
) == int(backward):
return True
return False
def _remove_op_role_var(self, param, grad):
op_maker = core.op_proto_and_checker_maker
op = grad.op
assert self._is_the_backward_op(
op
), 'grad.op={} is not the backward op which produces the grad={}'.format(
op, grad.name
)
block = grad.block
var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
assert (
param.name in var_attr
), 'when using GradientMergeOptimizer, param={} must be in var_attr={}'.format(
param.name, var_attr
)
assert (
grad.name in var_attr
), 'when using GradientMergeOptimizer, grad={} must be in var_attr={}'.format(
grad.name, var_attr
)
# remove (param, grad) from op_role_var
var_attr.remove(param.name)
var_attr.remove(grad.name)
if len(var_attr) > 1:
op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
else:
op._remove_attr(op_maker.kOpRoleVarAttrName())
def _add_gm_op_role_var(self, op, param, grad, cond):
grad.op = op
op_maker = core.op_proto_and_checker_maker
backward = op_maker.OpRole.Backward
# NOTE(wangxi). When distributed, we will insert grad_merge_all_reduce_op_handle
# in multi_devices_graph_pass, which will allreduce(grad) if cond is True, else
# do nothing.
# In this way, the gradient can be merged first, and then communicate when the
# condition is met, reducing the number of communications to increase the
# speed.
op._set_attr(self.GRAD_MERGE_COND_NAME, cond.name)
op._set_attr(op_maker.kOpRoleAttrName(), backward)
op._set_attr(op_maker.kOpRoleVarAttrName(), [param.name, grad.name])
def _get_gm_cond_var(self, main_block):
# Add const var
k_step_var = paddle.static.create_global_var(
name="gradient_merge_k",
shape=[1],
value=int(self.k_steps),
dtype='int32',
persistable=True,
force_cpu=True,
)
zero_var = paddle.static.create_global_var(
name="gradient_merge_zero",
shape=[1],
value=int(0),
dtype='int32',
persistable=True,
force_cpu=True,
)
# Add step var & cond var
step_var = paddle.static.create_global_var(
name="gradient_merge_step",
shape=[1],
value=int(0),
dtype='int32',
persistable=True,
force_cpu=True,
)
cond_var = main_block.create_var(
name="gradient_merge_cond", shape=[1], dtype='bool'
)
with device_guard("cpu"):
# step_var = (step_var + 1) % k_step
paddle.increment(x=step_var, value=1.0)
main_block.append_op(
type='elementwise_mod',
inputs={'X': step_var, 'Y': k_step_var},
outputs={'Out': step_var},
attrs={'axis': -1, 'use_mkldnn': False},
)
# cond_var = (step_var == 0)
main_block.append_op(
type='equal',
inputs={'X': step_var, 'Y': zero_var},
outputs={'Out': cond_var},
)
return cond_var
def apply_gradients(self, params_grads):
main_program = default_main_program()
startup_program = default_startup_program()
main_block = main_program.global_block()
startup_block = startup_program.global_block()
cond = self._get_gm_cond_var(main_block)
# TODO(mapingshuo) support sparse embedding
# step1: remove grad.op's op_role_var
for param, grad in params_grads:
assert (
param.type != core.VarDesc.VarType.SELECTED_ROWS
), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now"
self._remove_op_role_var(param, grad)
param_to_grad = {k.name: v for (k, v) in params_grads}
param_names = param_to_grad.keys()
param_to_gradient_merge = {}
new_params_grads = []
# step2: create gradient_merge var and init with 0
# and update op_role_var
for param, grad in params_grads:
param_name = param.name
param_var = main_block.var(param_name)
assert param_var is not None
gradient_merge_var = main_block.create_var(
name=param_name + "@GRAD@GradientMerge",
shape=param_var.shape,
dtype=param_var.dtype,
persistable=True,
)
param_to_gradient_merge[param_name] = gradient_merge_var
startup_gradient_merge_var = startup_block.create_var(
name=param_name + "@GRAD@GradientMerge",
shape=param_var.shape,
dtype=param_var.dtype,
persistable=True,
)
startup_block.append_op(
type="fill_constant",
outputs={"Out": startup_gradient_merge_var},
attrs={
"shape": param_var.shape,
"dtype": param_var.dtype,
"value": float(0),
},
)
# grad_merge += grad
new_grad_op = main_block.append_op(
type="elementwise_add",
inputs={'X': grad, 'Y': gradient_merge_var},
outputs={'Out': gradient_merge_var},
attrs={'axis': -1, 'use_mkldnn': False},
)
self._add_gm_op_role_var(
new_grad_op, param, gradient_merge_var, cond
)
new_params_grads.append([param, gradient_merge_var])
def true_apply_gradient():
cur_block_idx = main_program.current_block_idx
cur_block = main_program.current_block()
# cur_block's forward_block & backward_block is itself
cur_block._set_forward_block_idx(cur_block_idx)
op_maker = core.op_proto_and_checker_maker
if self.avg:
for param, new_grad in new_params_grads:
# grad /= k_steps
cur_block.append_op(
type='scale',
inputs={'X': new_grad},
outputs={'Out': new_grad},
attrs={
'scale': 1.0 / self.k_steps,
'bias': 0.0,
'bias_after_scale': False,
},
)
new_grad.op._set_attr(
op_maker.kOpRoleAttrName(), op_maker.OpRole.Backward
)
for param, new_grad in new_params_grads:
# NOTE. regularization will append ops to grad.block,
# while new_grad's real block is global_block,
# but we want append regularization ops to cur_block,
# so we set new_grad.block = cur_block
new_grad.block = cur_block
self._optimize_ops = self.inner_optimizer.apply_gradients(
new_params_grads
)
# clear gradient_merge_vars
for param, new_grad in new_params_grads:
paddle.tensor.fill_constant(
shape=new_grad.shape,
dtype=new_grad.dtype,
value=0.0,
out=new_grad,
)
new_grad.op._set_attr(
op_maker.kOpRoleAttrName(), op_maker.OpRole.Optimize
)
# step3. apply gradient
paddle.static.nn.cond(cond, true_fn=true_apply_gradient, false_fn=None)
return self._optimize_ops
def minimize(
self, loss, startup_program=None, parameter_list=None, no_grad_set=None
):
assert isinstance(loss, Variable), "The loss should be a Variable."
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set,
)
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
return optimize_ops, params_grads
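Since the constructor above rejects dygraph mode and points users to manual accumulation, a minimal dygraph sketch of the same strategy may help; the model, data, and `k_steps` below are hypothetical and not part of this PR:

    import paddle

    model = paddle.nn.Linear(32, 2)
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
    k_steps = 4

    for step in range(16):
        x = paddle.rand([8, 32])
        loss = paddle.mean(model(x))
        # dividing by k_steps mimics avg=True, so the merged gradient is an average
        (loss / k_steps).backward()  # gradients accumulate across backward calls
        if (step + 1) % k_steps == 0:
            opt.step()        # apply the accumulated gradient every k steps
            opt.clear_grad()  # reset the merged gradients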
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from paddle import _legacy_C_ops
from paddle.fluid import framework
from paddle.fluid.framework import in_dygraph_mode
from paddle.optimizer import Optimizer
class LarsMomentumOptimizer(Optimizer):
r"""
Momentum optimizer with LARS support
The update equations are as follows:
.. math::
& local\_learning\_rate = learning\_rate * lars\_coeff * \\
\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
& velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)
& param = param - velocity
Parameters:
learning_rate (float|Variable): The learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element. \
momentum (float): momentum factor
lars_coeff (float): Defines how much we trust the layer to change its weights.
lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static graph mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
:ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
exclude_from_weight_decay (list[str], optional): Name strings of layers which will be excluded from LARS weight decay. Default is None.
epsilon (float, optional): Epsilon to avoid division by zero when calculating the local lr. Default is 0.
multi_precision (bool, optional): Whether to use multi-precision during weight updating.
rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
before updating. Often chosen to be `1.0/batch_size`.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
import numpy as np
paddle.enable_static()
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
inp = paddle.static.data(
name="inp", shape=[2, 2], dtype='float32')
out = paddle.static.nn.fc(inp, size=3)
out = paddle.sum(out)
optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
optimizer.minimize(out)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(
feed={"inp": np_inp},
fetch_list=[out.name])
"""
_velocity_acc_str = "velocity"
def __init__(
self,
learning_rate,
momentum,
lars_coeff=0.001,
lars_weight_decay=0.0005,
parameter_list=None,
regularization=None,
grad_clip=None,
name=None,
exclude_from_weight_decay=None,
epsilon=0,
multi_precision=False,
rescale_grad=1.0,
):
assert learning_rate is not None
assert momentum is not None
super().__init__(
learning_rate=learning_rate,
parameters=parameter_list,
weight_decay=regularization,
grad_clip=grad_clip,
name=name,
)
self.type = "lars_momentum"
self._momentum = momentum
self._lars_coeff = float(lars_coeff)
self._lars_weight_decay = float(lars_weight_decay)
self._epsilon = float(epsilon)
if exclude_from_weight_decay is None:
self._exclude_from_weight_decay = []
else:
self._exclude_from_weight_decay = exclude_from_weight_decay
self._multi_precision = multi_precision
self._rescale_grad = float(rescale_grad)
self._master_weights = {}
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._velocity_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
_lars_weight_decay = self._lars_weight_decay
param_name = param_and_grad[0].name
if len(self._exclude_from_weight_decay) > 0:
for name in self._exclude_from_weight_decay:
if name in param_name:
_lars_weight_decay = 0.0
break
velocity_acc = self._get_accumulator_master(
self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
attrs = {
"mu": self._momentum,
"lars_coeff": self._lars_coeff,
"lars_weight_decay": [_lars_weight_decay],
"multi_precision": find_master,
"epsilon": self._epsilon,
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Velocity": velocity_acc,
"LearningRate": lr,
}
outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
if in_dygraph_mode():
tmp, tmp2 = _legacy_C_ops.lars_momentum(
[param_and_grad[0]],
[param_and_grad[1]],
[velocity_acc],
[lr],
[param_and_grad[0]],
[velocity_acc],
"mu",
self._momentum,
"lars_coeff",
self._lars_coeff,
"lars_weight_decay",
[_lars_weight_decay],
"multi_precision",
find_master,
"epsilon",
self._epsilon,
"rescale_grad",
self._rescale_grad,
)
else:
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return momentum_op
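As a cross-check of the update equations in the docstring above, a NumPy sketch of one LARS-momentum step; it mirrors the formulas as written (including the epsilon placement), not the fused `lars_momentum` kernel:

    import numpy as np

    def lars_step(param, grad, velocity, lr, mu=0.9,
                  lars_coeff=0.001, lars_weight_decay=0.0005, epsilon=0.0):
        # local learning rate scales with the parameter/gradient norm ratio
        p_norm = np.linalg.norm(param)
        g_norm = np.linalg.norm(grad)
        local_lr = lr * lars_coeff * p_norm / (g_norm + lars_weight_decay * p_norm)
        # momentum update with LARS weight decay folded into the gradient
        velocity = mu * velocity + local_lr * (grad + lars_weight_decay * param + epsilon)
        return param - velocity, velocity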
This diff is collapsed.
This diff is collapsed.
......@@ -350,6 +350,13 @@ class Adam(Optimizer):
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc],
}
# Pass found_inf to adam so that the update is skipped not only for param, but also for momentum and beta_pow
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
inputs['SkipUpdate'] = found_inf
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
......
......@@ -19,6 +19,7 @@ from . import amp # noqa: F401
from . import nn # noqa: F401
from .nn.common import py_func # noqa: F401
from .nn.common import ExponentialMovingAverage # noqa: F401
from .io import save_inference_model # noqa: F401
from .io import load_inference_model # noqa: F401
......@@ -70,8 +71,6 @@ from ..fluid.framework import set_ipu_shard # noqa: F401
from .nn.control_flow import Print # noqa: F401
from ..fluid.param_attr import WeightNormParamAttr # noqa: F401
from ..fluid.optimizer import Optimizer # noqa: F401
from ..fluid.optimizer import Adam # noqa: F401
from ..fluid.optimizer import ExponentialMovingAverage # noqa: F401
from ..fluid.layers import exponential_decay # noqa: F401
from ..fluid.layers import learning_rate_scheduler # noqa: F401
......
......@@ -483,7 +483,7 @@ class OptimizerWithMixedPrecision:
real_optimizer = real_optimizer.inner_opt
if isinstance(
real_optimizer,
(paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW),
(paddle.optimizer.Adam, paddle.optimizer.AdamW),
):
# NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we
# copy it in advance to avoid multiple time copies.
......
......@@ -24,11 +24,20 @@ from paddle.common_ops_import import (
check_type,
check_variable_and_dtype,
)
from paddle.fluid import core
from paddle.fluid import core, layers, unique_name
from paddle.fluid.data_feeder import check_dtype
from paddle.fluid.framework import Variable, in_dygraph_mode, static_only
from paddle.fluid.framework import (
Program,
Variable,
default_main_program,
in_dygraph_mode,
name_scope,
program_guard,
static_only,
)
from paddle.fluid.layers.layer_function_generator import templatedoc
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.nn.initializer import Constant, Normal
__all__ = []
......@@ -3999,3 +4008,259 @@ def sparse_embedding(
},
)
return tmp
class ExponentialMovingAverage:
r"""
Compute the moving average of parameters with exponential decay.
Given a parameter :math:`\\theta`, its exponential moving average (EMA)
will be
.. math::
\text{EMA}_0 & = 0
\text{EMA}_t & = \text{decay} * \text{EMA}_{t-1} + (1 - \text{decay}) * \theta_t
The average results calculated by the **update()** method are saved in
temporary variables which are created and maintained by the object, and can
be applied to the parameters of the current model by calling the **apply()**
method. The **restore()** method is used to restore the parameters.
**Bias correction**. All EMAs are initialized to :math:`0` and hence they will be
zero biased, which can be corrected by dividing by a factor
:math:`(1 - \text{decay}^t)`, i.e., the actual EMAs applied to parameters
when calling the **apply()** method would be
.. math::
\widehat{\text{EMA}}_t = \frac{\text{EMA}_t}{1 - \text{decay}^t}
**Decay rate scheduling**. A decay rate very close to 1 makes
the averages move very slowly, so a better strategy is to set a
relatively smaller decay rate at the very beginning. The argument **thres_steps**
allows users to pass a Variable to schedule the decay rate, in which case
the actual decay rate becomes
.. math::
\min(\text{decay}, \frac{1 + \text{thres_steps}}{10 + \text{thres_steps}})
Usually **thres_steps** can be the global training steps.
Args:
decay (float, optional): The exponential decay rate, usually close to 1, such as 0.999, 0.9999, ... . Default 0.999.
thres_steps (Variable|None, optional): If not `None`, schedule the decay rate. Default None.
name (str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name does not need to be set and is None by default.
Examples:
.. code-block:: python
import numpy
import paddle
import paddle.static as static
from paddle.static import ExponentialMovingAverage
paddle.enable_static()
data = static.data(name='x', shape=[-1, 5], dtype='float32')
hidden = static.nn.fc(x=data, size=10)
cost = paddle.mean(hidden)
test_program = static.default_main_program().clone(for_test=True)
optimizer = paddle.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(cost)
ema = ExponentialMovingAverage(0.999)
ema.update()
place = paddle.CPUPlace()
exe = static.Executor(place)
exe.run(static.default_startup_program())
for pass_id in range(3):
for batch_id in range(6):
data = numpy.random.random(size=(10, 5)).astype('float32')
exe.run(program=static.default_main_program(),
feed={'x': data},
fetch_list=[cost.name])
# usage 1
with ema.apply(exe):
data = numpy.random.random(size=(10, 5)).astype('float32')
exe.run(program=test_program,
feed={'x': data},
fetch_list=[hidden.name])
# usage 2
with ema.apply(exe, need_restore=False):
data = numpy.random.random(size=(10, 5)).astype('float32')
exe.run(program=test_program,
feed={'x': data},
fetch_list=[hidden.name])
ema.restore(exe)
"""
def __init__(self, decay=0.999, thres_steps=None, name=None):
if in_dygraph_mode():
raise Exception(
"In dygraph, don't support ExponentialMovingAverage."
)
self._decay = decay
self._thres_steps = thres_steps
self._name = name if name is not None else ''
self._decay_var = self._get_ema_decay()
self._step_counter_name = "@EMA_STEP_COUNTER@"
self._params_tmps = []
for param in default_main_program().global_block().all_parameters():
if param.do_model_average:
tmp = param.block.create_var(
name=unique_name.generate(
".".join([self._name + param.name, 'ema_tmp'])
),
dtype=param.dtype,
persistable=False,
stop_gradient=True,
)
self._params_tmps.append((param, tmp))
self._ema_vars = {}
for param, tmp in self._params_tmps:
with param.block.program._optimized_guard([param, tmp]), name_scope(
'moving_average'
):
self._ema_vars[param.name] = self._create_ema_vars(param)
self.apply_program = Program()
block = self.apply_program.global_block()
with program_guard(main_program=self.apply_program):
decay_pow, global_step = self._get_decay_pow(block)
for param, tmp in self._params_tmps:
param = block._clone_variable(param)
tmp = block._clone_variable(tmp)
ema = block._clone_variable(self._ema_vars[param.name])
paddle.assign(param, output=tmp)
# bias correction
param_val = paddle.static.nn.cond(
global_step > 0,
lambda: ema / (1.0 - decay_pow),
lambda: ema,
)
paddle.assign(param_val, output=param)
self.restore_program = Program()
block = self.restore_program.global_block()
with program_guard(main_program=self.restore_program):
for param, tmp in self._params_tmps:
tmp = block._clone_variable(tmp)
param = block._clone_variable(param)
paddle.assign(tmp, output=param)
def _get_ema_decay(self):
with default_main_program()._lr_schedule_guard():
decay_var = paddle.static.create_global_var(
shape=[1],
value=self._decay,
dtype='float32',
persistable=True,
name="scheduled_ema_decay_rate",
)
if self._thres_steps is not None:
decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
decay_val = paddle.static.nn.cond(
decay_t < self._decay,
lambda: decay_t,
lambda: np.array([self._decay], dtype=np.float32),
)
paddle.assign(decay_val, decay_var)
return decay_var
def _get_decay_pow(self, block):
global_step = paddle.static.create_global_var(
name=self._step_counter_name,
shape=[1],
value=0,
dtype='int64',
persistable=True,
)
global_step = paddle.cast(global_step, "float32")
decay_var = block._clone_variable(self._decay_var)
decay_pow_acc = paddle.pow(decay_var, global_step)
return decay_pow_acc, global_step
def _create_ema_vars(self, param):
param_ema = paddle.static.create_global_var(
name=unique_name.generate(self._name + param.name + '_ema'),
shape=param.shape,
value=0.0,
dtype=param.dtype,
persistable=True,
)
return param_ema
def update(self):
"""
Update the Exponential Moving Average. This method should only be called in
the train program.
"""
global_step = layers.autoincreased_step_counter(
counter_name=self._step_counter_name
)
param_master_emas = []
for param, tmp in self._params_tmps:
with param.block.program._optimized_guard([param, tmp]), name_scope(
'moving_average'
):
param_ema = self._ema_vars[param.name]
if param.name + '.master' in self._ema_vars:
master_ema = self._ema_vars[param.name + '.master']
param_master_emas.append([param_ema, master_ema])
else:
ema_t = param_ema * self._decay_var + param * (
1 - self._decay_var
)
paddle.assign(ema_t, output=param_ema)
# for fp16 params
for param_ema, master_ema in param_master_emas:
default_main_program().global_block().append_op(
type="cast",
inputs={"X": master_ema},
outputs={"Out": param_ema},
attrs={
"in_dtype": master_ema.dtype,
"out_dtype": param_ema.dtype,
},
)
@signature_safe_contextmanager
def apply(self, executor, need_restore=True):
"""
Apply moving average to parameters for evaluation.
Args:
executor (Executor): The Executor to execute applying.
need_restore (bool, optional): Whether to restore parameters after
applying. Default True.
"""
executor.run(self.apply_program)
try:
yield
finally:
if need_restore:
self.restore(executor)
def restore(self, executor):
"""Restore parameters.
Args:
executor (Executor): The Executor to execute restoring.
"""
executor.run(self.restore_program)
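To make the bias-corrected update concrete, a minimal NumPy sketch of what **update()** and **apply()** compute per parameter; the decay value and the parameter stream are hypothetical and not part of this PR:

    import numpy as np

    decay = 0.999
    ema = 0.0
    for t, theta in enumerate(np.random.randn(100), start=1):
        ema = decay * ema + (1 - decay) * theta   # running EMA_t, as in update()
        ema_hat = ema / (1.0 - decay ** t)        # bias-corrected value applied by apply()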
......@@ -72,7 +72,7 @@ class TestASPHelperPruningBase(unittest.TestCase):
)
)
optimizer = paddle.incubate.asp.decorate(
fluid.optimizer.SGD(learning_rate=0.01)
paddle.optimizer.SGD(learning_rate=0.01)
)
optimizer.minimize(loss, self.startup_program)
......
......@@ -275,7 +275,7 @@ class TestASPStaticCustomerizedPruneFunc(unittest.TestCase):
)
)
optimizer = sparsity.decorate(
fluid.optimizer.SGD(learning_rate=0.01)
paddle.optimizer.SGD(learning_rate=0.01)
)
optimizer.minimize(loss, self.startup_program)
......
......@@ -56,7 +56,7 @@ class TestASPStaticOptimize(unittest.TestCase):
use_softmax=False,
)
)
self.optimizer = fluid.optimizer.SGD(learning_rate=0.01)
self.optimizer = paddle.optimizer.SGD(learning_rate=0.01)
def test_get_not_ASP_relevant_vars(self):
def check_params(params, params_from_asp):
......
......@@ -77,7 +77,7 @@ class TestASPStaticPruningBase(unittest.TestCase):
)
)
optimizer = paddle.incubate.asp.decorate(
fluid.optimizer.SGD(learning_rate=0.01)
paddle.optimizer.SGD(learning_rate=0.01)
)
optimizer.minimize(loss, self.startup_program)
......
......@@ -153,7 +153,7 @@ class TestASPStaticOptimize(unittest.TestCase):
use_softmax=False,
)
)
self.optimizer = fluid.optimizer.SGD(learning_rate=0.01)
self.optimizer = paddle.optimizer.SGD(learning_rate=0.01)
self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
self.optimizer.minimize(self.loss, self.startup_program)
......
......@@ -92,7 +92,7 @@ class TestFleetWithASPSharding(unittest.TestCase):
)
with fluid.program_guard(train_prog, startup_prog):
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy
)
......
......@@ -71,7 +71,7 @@ class TestFleetWithASPStatic(unittest.TestCase):
)
with fluid.program_guard(train_prog, startup_prog):
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy
)
......
......@@ -132,7 +132,7 @@ def train():
train_program, start_program
)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -118,7 +118,7 @@ def train():
vocab_size,
)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -38,7 +38,7 @@ def train():
train_program, start_program
)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -94,7 +94,7 @@ def train(fetch):
initializer_range=0.02,
)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -75,7 +75,7 @@ def train(fetch):
initializer_range=0.02,
)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -131,7 +131,7 @@ def get_prog(train_program, startup_program, dist_context, rank_id):
)
fleet._user_defined_strategy = fleet.DistributedStrategy()
fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer()
fleet.user_defined_optimizer = paddle.optimizer.Adam()
parallelizer = AutoParallelizer(fleet)
parallelizer._dist_context = dist_context
......
......@@ -111,7 +111,7 @@ def get_program_v3():
criterion = GPTPretrainingCriterion()
loss = criterion(preds, labels, loss_mask)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -113,7 +113,7 @@ def get_program_v3():
criterion = GPTPretrainingCriterion()
loss = criterion(preds, labels, loss_mask)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -111,7 +111,7 @@ def get_program_v3():
criterion = GPTPretrainingCriterion()
loss = criterion(preds, labels, loss_mask)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -75,7 +75,7 @@ def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16):
avg_cost = paddle.mean(cost)
lr = 5e-3 if use_bf16 else 1e-3
sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=lr)
if use_bf16:
sgd_optimizer = amp.bf16.decorate_bf16(
......
......@@ -132,7 +132,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
# Test program
test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer = paddle.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_cost)
BATCH_SIZE = 128
......
......@@ -96,7 +96,7 @@ def train(
test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer = paddle.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_loss)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
......@@ -27,7 +27,7 @@ import paddle
from paddle import fluid
from paddle.fluid import framework
from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGDOptimizer
from paddle.optimizer import SGD
paddle.enable_static()
......@@ -188,7 +188,7 @@ def train(use_cuda, save_dirname, is_local=True):
# test program
test_program = fluid.default_main_program().clone(for_test=True)
sgd_optimizer = SGDOptimizer(learning_rate=0.2)
sgd_optimizer = SGD(learning_rate=0.2)
sgd_optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
......@@ -123,7 +123,7 @@ def train(
else:
raise NotImplementedError()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
if use_bf16:
sgd_optimizer = paddle.static.amp.bf16.decorate_bf16(
sgd_optimizer,
......
......@@ -119,7 +119,7 @@ class TestMLPAutoParallelizer(unittest.TestCase):
train_program, start_program
)
optimizer = paddle.fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
......
......@@ -53,10 +53,8 @@ class TestDistMnist2x2(TestDistRunnerBase):
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = fluid.optimizer.GradientMergeOptimizer(opt, 2)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = paddle.incubate.optimizer.GradientMergeOptimizer(opt, 2)
if single_device:
opt.minimize(avg_cost)
else:
......
......@@ -68,7 +68,7 @@ class TestDistMnistGradientMergeRawOptimizer(TestDistRunnerBase):
test_program = paddle.static.default_main_program().clone(for_test=True)
optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
if single_device:
optimizer = fluid.optimizer.GradientMergeOptimizer(
optimizer = paddle.incubate.optimizer.GradientMergeOptimizer(
optimizer,
k_steps=strategy.gradient_merge_configs["k_steps"],
avg=strategy.gradient_merge_configs["avg"],
......
......@@ -175,7 +175,7 @@ class TestFusedAttentionPassWithMP(unittest.TestCase):
out = multi_head_attn(attn_input, attn_mask)
loss = paddle.mean(out)
sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(loss)
startup_block = startup_prog.global_block()
......
......@@ -55,22 +55,23 @@ def optimizer_setting(params, parameter_list=None):
bd = [step * e for e in ls["epochs"]]
lr = params["lr"]
num_epochs = params["num_epochs"]
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.Momentum(
optimizer = paddle.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs
),
momentum=momentum_rate,
regularization=paddle.regularizer.L2Decay(l2_decay),
weight_decay=paddle.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list,
)
else:
optimizer = fluid.optimizer.Momentum(
optimizer = paddle.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs
),
momentum=momentum_rate,
regularization=paddle.regularizer.L2Decay(l2_decay),
weight_decay=paddle.regularizer.L2Decay(l2_decay),
)
return optimizer
......
......@@ -19,7 +19,6 @@ from legacy_test.test_dist_base import (
)
import paddle
from paddle import fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import Conv2D, SyncBatchNorm
......@@ -79,8 +78,8 @@ class TestSyncBatchNorm(TestParallelDyGraphRunnerBase):
batch_size=32,
drop_last=True,
)
opt = fluid.optimizer.Adam(
learning_rate=1e-3, parameter_list=model.parameters()
opt = paddle.optimizer.Adam(
learning_rate=1e-3, parameters=model.parameters()
)
return model, train_reader, opt
......
......@@ -1089,11 +1089,11 @@ class TestTransformer(TestParallelDyGraphRunnerBase):
fake_data_reader(), TrainTaskConfig.batch_size
)
if naive_optimize:
optimizer = fluid.optimizer.SGD(
learning_rate=0.001, parameter_list=model.parameters()
optimizer = paddle.optimizer.SGD(
learning_rate=0.001, parameters=model.parameters()
)
else:
optimizer = fluid.optimizer.Adam(
optimizer = paddle.optimizer.Adam(
learning_rate=NoamDecay(
ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps,
......@@ -1102,7 +1102,7 @@ class TestTransformer(TestParallelDyGraphRunnerBase):
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps,
parameter_list=model.parameters(),
parameters=model.parameters(),
)
return model, train_reader, optimizer
......
......@@ -121,8 +121,8 @@ class TestDistMnist2x2(TestDistRunnerBase):
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
opt = fluid.optimizer.Momentum(
lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr)
opt = paddle.optimizer.Momentum(
learning_rate=lr_val,
momentum=0.9,
grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
......
......@@ -113,8 +113,8 @@ class TestDistMnist2x2(TestDistRunnerBase):
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr)
opt = paddle.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
# Reader
train_reader = paddle.batch(
......
......@@ -91,7 +91,7 @@ class TestModelParallel(TestDistRunnerBase):
rank = fleet.worker_index() if dist_strategy else None
avg_cost = create_model(data_in, rank)
opt = fluid.optimizer.SGD(0.1)
opt = paddle.optimizer.SGD(0.1)
if dist_strategy:
dist_opt = fleet.distributed_optimizer(
......
......@@ -95,7 +95,7 @@ class TestModelParallel(TestDistRunnerBase):
rank = fleet.worker_index() if dist_strategy else None
avg_cost = create_model(data_in, rank)
opt = fluid.optimizer.SGD(0.1)
opt = paddle.optimizer.SGD(0.1)
if dist_strategy:
dist_opt = fleet.distributed_optimizer(
......
......@@ -85,7 +85,7 @@ class TestModelParallel(TestDistRunnerBase):
rank = fleet.worker_index() if dist_strategy else None
avg_cost = create_model(data_in, rank)
opt = fluid.optimizer.SGD(0.1)
opt = paddle.optimizer.SGD(0.1)
if dist_strategy:
dist_opt = fleet.distributed_optimizer(
......
......@@ -49,7 +49,7 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
def run_pserver(self, role, strategy):
fleet.init(role)
avg_cost, x, y = self.net()
optimizer = fluid.optimizer.SGD(0.01)
optimizer = paddle.optimizer.SGD(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
......@@ -62,7 +62,7 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
fleet.init(role)
avg_cost, x, y = self.net()
optimizer = fluid.optimizer.SGD(0.01)
optimizer = paddle.optimizer.SGD(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
......
......@@ -20,7 +20,6 @@ import paddle
paddle.enable_static()
from paddle import fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.base import role_maker
......@@ -48,7 +47,7 @@ class TestCommunicator(unittest.TestCase):
fleet.init(role_maker.PaddleCloudRoleMaker())
avg_cost = self.net()
optimizer = fluid.optimizer.SGD(0.01)
optimizer = paddle.optimizer.SGD(0.01)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
......
......@@ -16,7 +16,7 @@ import unittest
import paddle
from paddle import regularizer
from paddle.fluid import framework, optimizer
from paddle.fluid import framework
from paddle.nn import clip
paddle.enable_static()
......@@ -79,9 +79,11 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
)
if use_recompute:
dgc_momentum_optimizer = optimizer.RecomputeOptimizer(
dgc_momentum_optimizer = (
paddle.incubate.optimizer.RecomputeOptimizer(
dgc_momentum_optimizer
)
)
dgc_momentum_optimizer._set_checkpoints([])
dgc_momentum_optimizer.get_accumulators = (
dgc_momentum_optimizer._optimizer.get_accumulators
......
......@@ -242,7 +242,7 @@ class TestCreateDefaultStrategy(unittest.TestCase):
fleet.init(role)
def type_error_optimizer():
optimizer = fluid.optimizer.SGD(0.0001)
optimizer = paddle.optimizer.SGD(0.0001)
optimizer = fleet.distributed_optimizer(optimizer)
self.assertRaises(TypeError, type_error_optimizer)
......@@ -264,7 +264,7 @@ class TestHalfAsyncStrategy(unittest.TestCase):
half_async_config.geo_sgd_mode = False
half_async_config.runtime_split_send_recv = False
optimizer = fluid.optimizer.SGD(0.0001)
optimizer = paddle.optimizer.SGD(0.0001)
optimizer = fleet.distributed_optimizer(optimizer, half_async_config)
......@@ -284,7 +284,7 @@ class TestDebugInfo(unittest.TestCase):
)
fleet.init(role)
optimizer = fluid.optimizer.SGD(0.0001)
optimizer = paddle.optimizer.SGD(0.0001)
strategy = StrategyFactory.create_sync_strategy()
strategy.set_debug_opt(
{
......
......@@ -31,9 +31,7 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = AMPOptimizer(opt)
self.set_strategy(strategy, 'amp')
......@@ -50,9 +48,7 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = AMPOptimizer(opt)
self.set_strategy(strategy, 'amp')
......@@ -71,9 +67,7 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = AMPOptimizer(opt)
self.set_strategy(strategy, 'amp')
......
......@@ -47,7 +47,7 @@ class FleetTest(unittest.TestCase):
input=predict, label=label, reduction='none', use_softmax=False
)
avg_loss = paddle.mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)
optimizer = paddle.optimizer.Adam(learning_rate=0.001)
dist_optimizer = fleet.distributed_optimizer(optimizer)
dist_optimizer.minimize(avg_loss)
......
......@@ -33,9 +33,7 @@ class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'dgc')
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
dgc_opt = DGCOptimizer(opt)
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
......@@ -50,9 +48,7 @@ class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'dgc')
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
dgc_opt = DGCOptimizer(opt)
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
......@@ -70,9 +66,7 @@ class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'dgc')
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
dgc_opt = DGCOptimizer(opt)
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
......
......@@ -56,7 +56,7 @@ class TestFleetFP16CompressOptimizer(unittest.TestCase):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -83,7 +83,7 @@ class TestFleetFP16CompressOptimizer(unittest.TestCase):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16')
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -70,7 +70,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -83,9 +83,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.1, momentum=0.9
)
optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -98,7 +96,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
strategy.lamb_configs = {
'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': ['.b_0'],
......@@ -146,7 +144,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
'exclude_from_weight_decay': [],
}
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -72,9 +72,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9
)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -87,7 +85,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -100,9 +98,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9
)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -153,9 +149,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
"exclude_from_weight_decay": ["batch_norm", ".b"],
}
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9
)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -51,7 +51,7 @@ class TestFleetMetaOptimizerBase(unittest.TestCase):
)
avg_cost = paddle.mean(x=cost)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
opt = MetaOptimizerBase(optimizer)
opt_ops, params_grads = opt.minimize(avg_cost)
opt.apply_optimize(
......
......@@ -78,7 +78,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
with fluid.unique_name.guard():
avg_cost = self.net()
optimizer = paddle.fluid.optimizer.Adam(0.01)
optimizer = paddle.optimizer.Adam(0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy
)
......@@ -102,7 +102,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
with fluid.unique_name.guard():
avg_cost = self.net()
optimizer = paddle.fluid.optimizer.Adam(0.01)
optimizer = paddle.optimizer.Adam(0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy
)
......
......@@ -74,7 +74,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
"checkpoint_shape": [],
}
optimizer = paddle.fluid.optimizer.Adam(0.01)
optimizer = paddle.optimizer.Adam(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -64,7 +64,7 @@ class TestFleetWithQAT(unittest.TestCase):
mse = paddle.nn.MSELoss()
out = model(input_x)
loss = mse(out, input_y)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy
)
......
......@@ -47,7 +47,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.without_graph_optimization = True
optimizer = paddle.fluid.optimizer.Adam(0.01)
optimizer = paddle.optimizer.Adam(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -30,9 +30,7 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'recompute')
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = RecomputeOptimizer(opt)
opt.user_defined_strategy = strategy
params_grads = opt.backward(avg_cost, startup_prog)
......@@ -48,9 +46,7 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'recompute')
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = RecomputeOptimizer(opt)
opt.user_defined_strategy = strategy
params_grads = opt.backward(avg_cost, startup_prog)
......@@ -68,9 +64,7 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'recompute')
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opt = RecomputeOptimizer(opt)
opt.user_defined_strategy = strategy
params_grads = opt.backward(avg_cost, startup_prog)
......
......@@ -609,12 +609,6 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'c_reduce_sum',
'c_reduce_sum',
'c_sync_comm_stream',
'scale',
'sum',
'scale',
'sum',
'scale',
'sum',
'momentum',
'momentum',
'momentum',
......
......@@ -106,7 +106,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
y = model_a(input_x)
loss = paddle.mean(y)
optimizer = paddle.fluid.optimizer.Adam(0.01)
optimizer = paddle.optimizer.Adam(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)
ref_ops = [
......
......@@ -222,8 +222,8 @@ class TestAmpScaler(unittest.TestCase):
stride=2,
act='relu',
)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters()
optimizer = paddle.optimizer.SGD(
learning_rate=0.01, parameters=model.parameters()
)
scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
......@@ -331,8 +331,8 @@ class TestAmpScaler(unittest.TestCase):
params_init = {}
for param in model.parameters():
params_init[param.name] = param.numpy()
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters()
optimizer = paddle.optimizer.SGD(
learning_rate=0.01, parameters=model.parameters()
)
scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
......
......@@ -58,8 +58,8 @@ class AMPTest(unittest.TestCase):
out = model(x)
loss = mse(out, label)
opt = paddle.fluid.optimizer.Adam(
learning_rate=0.0001, parameter_list=model.parameters()
opt = paddle.optimizer.Adam(
learning_rate=0.0001, parameters=model.parameters()
) # define the optimizer
opt = paddle.static.amp.decorate(
opt, init_loss_scaling=128.0, use_dynamic_loss_scaling=True
......
......@@ -27,7 +27,6 @@
import unittest
import paddle
from paddle import fluid
from paddle.incubate.distributed.fleet.collective import (
CollectiveOptimizer,
DistributedStrategy,
......@@ -36,11 +35,11 @@ from paddle.incubate.distributed.fleet.collective import (
class CollectiveOptimizerTest(unittest.TestCase):
def test_ds_as_None(self):
optimizer = fluid.optimizer.AdamOptimizer()
optimizer = paddle.optimizer.Adam()
dist_optimizer = CollectiveOptimizer(optimizer, strategy=None)
def test_recompute_checkpoints(self):
optimizer = fluid.optimizer.AdamOptimizer()
optimizer = paddle.optimizer.Adam()
dist_strategy = DistributedStrategy()
dist_strategy.forward_recompute = True
dist_strategy.recompute_checkpoints = "NoneListTest"
......@@ -52,8 +51,8 @@ class CollectiveOptimizerTest(unittest.TestCase):
self.assertRaises(ValueError, dist_optimizer.minimize, None)
def test_recompute_strategy(self):
optimizer = fluid.optimizer.AdamOptimizer()
optimizer = fluid.optimizer.RecomputeOptimizer(optimizer)
optimizer = paddle.optimizer.Adam()
optimizer = paddle.incubate.optimizer.RecomputeOptimizer(optimizer)
dist_strategy = DistributedStrategy()
dist_strategy.forward_recompute = True
dist_strategy.recompute_checkpoints = ["Test"]
......@@ -61,7 +60,7 @@ class CollectiveOptimizerTest(unittest.TestCase):
self.assertRaises(ValueError, dist_optimizer.minimize, None)
def test_amp_strategy(self):
optimizer = fluid.optimizer.AdamOptimizer()
optimizer = paddle.optimizer.Adam()
optimizer = paddle.static.amp.decorate(
optimizer, init_loss_scaling=1.0, use_dynamic_loss_scaling=True
)
......
......@@ -125,7 +125,7 @@ class TestCorrelationOp(unittest.TestCase):
)
loss = paddle.mean(out)
optimizer = fluid.optimizer.Momentum(0.0001, 0.9)
optimizer = paddle.optimizer.Momentum(0.0001, 0.9)
optimizer.minimize(loss)
place = fluid.CUDAPlace(0)
......
......@@ -126,7 +126,7 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
multi_precision=True,
)
elif optimizer == "Lars":
optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(
learning_rate=0.001, momentum=0.9, multi_precision=use_pure_fp16
)
else:
......
......@@ -222,7 +222,7 @@ class AutoPallelPassTestBase(DistPassTestBase):
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
if kwargs.get('optimizer', None) == "LarsMomentum":
optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(
learning_rate=0.001, momentum=0.9
)
else:
......
......@@ -183,7 +183,7 @@ class TestGradientMergePass(AutoPallelPassTestBase):
loss = mlp_forward(input, label, hidden_size)
optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.01)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer)
(
_,
......
......@@ -106,7 +106,7 @@ class TestBert(unittest.TestCase):
config=bert_config, weight_sharing=False, use_fp16=False
)
optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters())
optimizer = paddle.optimizer.Adam(parameters=bert.parameters())
step_idx = 0
speed_list = []
for input_data in data_loader():
......
......@@ -448,10 +448,10 @@ def optimizer(cfg, parameter_list):
lr_decay = cfg.learning_rate_decay
l2_weight_decay = cfg.l2_weight_decay
lr = [base_lr, base_lr * lr_decay]
optimizer = fluid.optimizer.Adam(
fluid.layers.piecewise_decay(boundaries=bd, values=lr),
parameter_list=parameter_list,
regularization=paddle.regularizer.L2Decay(coeff=l2_weight_decay),
optimizer = paddle.optimizer.Adam(
paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr),
parameters=parameter_list,
weight_decay=paddle.regularizer.L2Decay(coeff=l2_weight_decay),
)
return optimizer
......
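For reference, a minimal dygraph sketch of the migrated optimizer(cfg, parameter_list) pattern above: paddle.optimizer.lr.PiecewiseDecay replaces fluid.layers.piecewise_decay, parameters= replaces parameter_list=, and weight_decay= replaces regularization=. The layer, boundaries, and coefficients below are placeholders rather than the values from the config.

import paddle

model = paddle.nn.Linear(10, 1)
scheduler = paddle.optimizer.lr.PiecewiseDecay(
    boundaries=[1000, 2000], values=[0.01, 0.008, 0.005]
)
optimizer = paddle.optimizer.Adam(
    learning_rate=scheduler,
    parameters=model.parameters(),
    weight_decay=paddle.regularizer.L2Decay(coeff=1e-4),
)

x = paddle.rand([4, 10])
loss = paddle.mean(model(x))
loss.backward()
optimizer.step()        # dygraph replacement for minimize()
optimizer.clear_grad()
scheduler.step()        # advance the piecewise schedule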
......@@ -96,8 +96,8 @@ class TestCacheProgramWithOptimizer(unittest.TestCase):
with fluid.dygraph.guard(fluid.CPUPlace()):
dygraph_net = self.dygraph_class()
adam = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, parameter_list=dygraph_net.parameters()
adam = paddle.optimizer.Adam(
learning_rate=0.001, parameters=dygraph_net.parameters()
)
loss_data = []
for batch_id in range(self.batch_num):
......
......@@ -531,8 +531,8 @@ class Args:
def optimizer_setting(parameters):
lr = 0.0002
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.piecewise_decay(
optimizer = paddle.optimizer.Adam(
learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=[
100 * step_per_epoch,
120 * step_per_epoch,
......@@ -542,7 +542,7 @@ def optimizer_setting(parameters):
],
values=[lr, lr * 0.8, lr * 0.6, lr * 0.4, lr * 0.2, lr * 0.1],
),
parameter_list=parameters,
parameters=parameters,
beta1=0.5,
)
return optimizer
......
......@@ -545,9 +545,9 @@ class TestLACModel(unittest.TestCase):
train_loader = create_dataloader(reader, place)
model = LexNet(args)
optimizer = fluid.optimizer.AdamOptimizer(
optimizer = paddle.optimizer.Adam(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters(),
parameters=model.parameters(),
)
chunk_eval = ChunkEval(
int(math.ceil((args.num_labels - 1) / 2.0)), "IOB"
......
......@@ -25,9 +25,9 @@ import paddle
from paddle import fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.optimizer import AdamOptimizer
from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.nn import Linear
from paddle.optimizer import Adam
SEED = 2020
......@@ -196,9 +196,7 @@ class TestMNISTWithToStatic(TestMNIST):
mnist = MNIST()
if to_static:
mnist = paddle.jit.to_static(mnist)
adam = AdamOptimizer(
learning_rate=0.001, parameter_list=mnist.parameters()
)
adam = Adam(learning_rate=0.001, parameters=mnist.parameters())
for epoch in range(self.epoch_num):
start = time()
......
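For reference, a minimal sketch of the @to_static training step used in the MNIST test above, with paddle.optimizer.Adam(parameters=...) in place of the removed AdamOptimizer(parameter_list=...). The single-layer network and random batch are placeholders.

import paddle

net = paddle.jit.to_static(paddle.nn.Linear(784, 10))
adam = paddle.optimizer.Adam(learning_rate=0.001, parameters=net.parameters())

img = paddle.randn([8, 784])
label = paddle.randint(0, 10, [8, 1])
loss = paddle.nn.functional.cross_entropy(net(img), label)
loss.backward()
adam.step()
adam.clear_grad()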
......@@ -20,7 +20,7 @@ from dygraph_to_static_util import test_and_compare_with_new_ir
from test_mnist import MNIST, SEED, TestMNIST
import paddle
from paddle.fluid.optimizer import AdamOptimizer
from paddle.optimizer import Adam
if paddle.fluid.is_compiled_with_cuda():
paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
......@@ -58,9 +58,7 @@ class TestAMP(TestMNIST):
print("Successfully to apply @to_static.")
mnist = paddle.jit.to_static(mnist)
adam = AdamOptimizer(
learning_rate=0.001, parameter_list=mnist.parameters()
)
adam = Adam(learning_rate=0.001, parameters=mnist.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
......
......@@ -446,11 +446,11 @@ class MobileNetV2(paddle.nn.Layer):
def create_optimizer(args, parameter_list):
optimizer = fluid.optimizer.Momentum(
optimizer = paddle.optimizer.Momentum(
learning_rate=args.lr,
momentum=args.momentum_rate,
regularization=paddle.regularizer.L2Decay(args.l2_decay),
parameter_list=parameter_list,
weight_decay=paddle.regularizer.L2Decay(args.l2_decay),
parameters=parameter_list,
)
return optimizer
......
......@@ -21,8 +21,8 @@ import numpy as np
import paddle
from paddle import fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.optimizer import SGDOptimizer
from paddle.jit.api import to_static
from paddle.optimizer import SGD
PRINT_STEP = 20
SEED = 2020
......@@ -247,9 +247,7 @@ def train(place):
dropout=dropout,
)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters()
)
sgd = SGD(learning_rate=1e-3, parameters=ptb_model.parameters())
for epoch_id in range(max_epoch):
total_loss = 0.0
......
......@@ -46,11 +46,11 @@ if fluid.is_compiled_with_cuda():
def optimizer_setting(parameter_list=None):
optimizer = fluid.optimizer.Momentum(
optimizer = paddle.optimizer.Momentum(
learning_rate=base_lr,
momentum=momentum_rate,
regularization=paddle.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list,
weight_decay=paddle.regularizer.L2Decay(l2_decay),
parameters=parameter_list,
)
return optimizer
......
......@@ -64,8 +64,8 @@ class TestDyToStaticSaveInferenceModel(unittest.TestCase):
x = fluid.dygraph.to_variable(x_data)
layer = SimpleFcLayer(fc_size)
adam = fluid.optimizer.SGD(
learning_rate=0.1, parameter_list=layer.parameters()
adam = paddle.optimizer.SGD(
learning_rate=0.1, parameters=layer.parameters()
)
for i in range(5):
......
......@@ -24,8 +24,8 @@ import paddle
import paddle.nn.functional as F
from paddle import fluid, nn
from paddle.fluid import core
from paddle.fluid.optimizer import AdamOptimizer
from paddle.nn import BatchNorm
from paddle.optimizer import Adam
np.random.seed(2020)
......@@ -75,9 +75,7 @@ class TestDyToStaticSaveLoad(unittest.TestCase):
paddle.jit.enable_to_static(True)
x = fluid.dygraph.to_variable(x_data)
net = Linear(32, 64)
adam = AdamOptimizer(
learning_rate=0.1, parameter_list=net.parameters()
)
adam = Adam(learning_rate=0.1, parameters=net.parameters())
for i in range(batch_num):
static_out, static_loss = net(x)
......
......@@ -78,13 +78,13 @@ def optimizer_setting(params, parameter_list):
bd = [step * e for e in ls["epochs"]]
lr = params["lr"]
num_epochs = params["num_epochs"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs
optimizer = paddle.optimizer.Momentum(
learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=lr, T_max=num_epochs
),
momentum=momentum_rate,
regularization=paddle.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list,
weight_decay=paddle.regularizer.L2Decay(l2_decay),
parameters=parameter_list,
)
return optimizer
......
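For reference, a minimal sketch of the migrated optimizer_setting above: paddle.optimizer.lr.CosineAnnealingDecay(T_max=...) stands in for fluid.layers.cosine_decay(step_each_epoch=..., epochs=...), and the scheduler is typically stepped once per epoch. The model and hyper-parameters are placeholders.

import paddle

model = paddle.nn.Linear(16, 4)
scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.1, T_max=120)
optimizer = paddle.optimizer.Momentum(
    learning_rate=scheduler,
    momentum=0.9,
    weight_decay=paddle.regularizer.L2Decay(1e-4),
    parameters=model.parameters(),
)

for epoch in range(3):
    loss = paddle.mean(model(paddle.rand([4, 16])))
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    scheduler.step()  # once per epoch, roughly matching the old per-epoch cosine decay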
This diff is collapsed. (31 file diffs in this change are collapsed and not shown here.)