Unverified commit 3cca2a87, authored by LoneRanger, committed by GitHub

remove the extend_optimizer_with_weight_decay function (#55007)

* remove the extend_optimizer_with_weight_decay function

* Update __init__.py

* fix bug

* fix bug
Parent b3c26de8
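Note: the removed fluid.contrib.extend_with_decoupled_weight_decay helper wrapped an existing optimizer to add decoupled weight decay. Current Paddle exposes the same behaviour directly through paddle.optimizer.AdamW, which is presumably why the contrib wrapper could be dropped. Below is a minimal sketch of that assumed replacement; the toy layer, shapes, and values are illustrative and not part of this commit.

import paddle

# A toy layer only for illustration; any trainable model works the same way.
linear = paddle.nn.Linear(10, 1)
opt = paddle.optimizer.AdamW(
    learning_rate=0.1,
    weight_decay=0.01,                  # decoupled weight decay coefficient
    parameters=linear.parameters(),
)

x = paddle.rand([4, 10])
loss = paddle.mean(linear(x))
loss.backward()
opt.step()                              # Adam update plus decoupled decay
opt.clear_grad()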
@@ -14,13 +14,9 @@
 # limitations under the License.
-from . import extend_optimizer
-from .extend_optimizer import *
 from . import optimizer
 from .optimizer import *
 __all__ = []
-__all__ += extend_optimizer.__all__
 __all__ += optimizer.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import extend_optimizer_with_weight_decay
from .extend_optimizer_with_weight_decay import *
__all__ = []
__all__ += extend_optimizer_with_weight_decay.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid
from paddle.fluid import framework as framework
__all__ = ["extend_with_decoupled_weight_decay"]
class DecoupledWeightDecay:
def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
if not isinstance(coeff, float) and not isinstance(
coeff, framework.Variable
):
raise TypeError("coeff should be float or Variable.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
super().__init__(**kwargs)
def _scale_parameters(self, params_and_grads):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Args:
params_and_grads: A list of (parameters, gradients) pairs,
the parameters need to decay.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
if isinstance(self._coeff, float) and self._coeff == 0.0:
            # Nothing to decay; return an empty list so minimize() can iterate it.
            return []
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if (
self._apply_decay_param_fun is not None
and not self._apply_decay_param_fun(param.name)
):
continue
if isinstance(self._coeff, float):
assert (
param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32
), (
"the type of coeff(float) and parameter(%s) is not consistent."
                    % (param.dtype)
)
else:
assert self._coeff.dtype == param.dtype, (
"the type of coeff(%s) and parameter(%s) is not consistent."
% (self._coeff.dtype, param.dtype)
)
with param.block.program._optimized_guard(
[param, grad]
), framework.name_scope('weight decay'):
assert param.name not in self._params_name
scaled_params.append((param, grad, param * self._coeff))
self._params_name.add(param.name)
return scaled_params
def backward(self, **kargs):
return super().backward(**kargs)
def apply_optimize(self, **kargs):
return super().apply_optimize(**kargs)
def minimize(
self, loss, startup_program=None, parameter_list=None, no_grad_set=None
):
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set,
)
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]
), framework.name_scope('weight decay'):
updated_param = paddle.subtract(x=param, y=scaled_param)
paddle.assign(updated_param, output=param)
optimize_ops = self.apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program,
)
return optimize_ops, params_grads
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
def extend_with_decoupled_weight_decay(base_optimizer):
"""
    extend_with_decoupled_weight_decay is a decorator function that returns an
    optimizer class with decoupled weight decay. The returned optimizer applies
    weight decay to the optimized parameters using the parameter values from
    before the optimization step, i.e.:
    new_parameter = optimized_parameter - parameter * coeff.
    For details of decoupled weight decay, please refer to
    `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
Args:
base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.
Returns:
        OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.
Examples:
.. code-block:: python
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam)
optimizer = AdamW(learning_rate=0.1,
weight_decay=0.01)
optimizer.minimize(cost)
"""
if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
raise TypeError(
"The input(base_optimizer) should be a derived class of Optimizer."
)
class OptimizerWithDecoupledWeightDecay(
DecoupledWeightDecay, base_optimizer
):
"""
OptimizerWithDecoupledWeightDecay is used to update the optimized parameters
        with the parameter values from before optimization. For more information, please refer to:
https://arxiv.org/pdf/1711.05101.pdf.
Args:
weight_decay (float|Variable): The weight decay coefficient, it can be
float or Variable.
apply_decay_param_fun (function|None): If it is not None,
                only parameters whose names make apply_decay_param_fun(name) == True
                will be decayed. It only takes effect when we want to decay a
                specified subset of the parameters.
Default: None.
"""
def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
super().__init__(weight_decay, apply_decay_param_fun, **kwargs)
return OptimizerWithDecoupledWeightDecay
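For readers of the deleted module above: extend_with_decoupled_weight_decay works through cooperative multiple inheritance. The dynamically created class puts the DecoupledWeightDecay mixin ahead of the chosen optimizer in the MRO, so minimize can apply the decay before delegating to the base optimizer via super(). The following is a minimal, Paddle-free sketch of that pattern; every class and function name in it is an illustrative stand-in, not a Paddle API.

class WeightDecayMixin:
    """Mixin that adds a decoupled decay step around a base optimizer's minimize()."""

    def __init__(self, coeff=0.0, **kwargs):
        self._coeff = coeff
        super().__init__(**kwargs)      # forwards remaining kwargs to the base optimizer

    def minimize(self, params):
        # params: dict of name -> value; decay each parameter in place first.
        for name in params:
            params[name] -= self._coeff * params[name]
        return super().minimize(params)


class ToySGD:
    """Stand-in for a real optimizer class."""

    def __init__(self, learning_rate=0.1):
        self.lr = learning_rate

    def minimize(self, params):
        for name in params:
            params[name] -= self.lr     # pretend this is a gradient step
        return params


def extend_with_weight_decay(base_cls):
    # Same trick as the removed helper: build a subclass whose MRO is
    # (mixin, base optimizer), then return the new class.
    class OptimizerWithWeightDecay(WeightDecayMixin, base_cls):
        pass

    return OptimizerWithWeightDecay


ToySGDW = extend_with_weight_decay(ToySGD)
opt = ToySGDW(coeff=0.01, learning_rate=0.1)
print(opt.minimize({"w": 1.0}))         # decay applied, then the base update

The real module additionally supports a Variable coeff and wraps the subtraction in the program's optimized guard; the sketch keeps only the class-composition mechanics.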
@@ -427,7 +427,6 @@ packages=['paddle',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
           'paddle.fluid.contrib',
-          'paddle.fluid.contrib.extend_optimizer',
           'paddle.fluid.incubate',
           'paddle.incubate.distributed.fleet',
           'paddle.fluid.incubate.checkpoint',
......
@@ -1431,7 +1431,6 @@ def get_setup_parameters():
         'paddle.fluid.proto.profiler',
         'paddle.fluid.layers',
         'paddle.fluid.contrib',
-        'paddle.fluid.contrib.extend_optimizer',
         'paddle.fluid.incubate',
         'paddle.incubate.distributed.fleet',
         'paddle.fluid.incubate.checkpoint',
......
@@ -20,5 +20,4 @@ py_test_modules(
   FLAGS_conv_workspace_size_limit=1000)
 set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
-set_tests_properties(test_weight_decay_extend PROPERTIES TIMEOUT 120)
 set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
from functools import partial
import numpy as np
import paddle
from paddle import fluid
paddle.enable_static()
SEED = 2020
def fake_imdb_reader(
word_dict_size,
sample_num,
lower_seq_len=100,
upper_seq_len=200,
class_dim=2,
):
def __reader__():
for _ in range(sample_num):
            # np.random.random_integers is deprecated in NumPy; np.random.randint
            # with an exclusive upper bound keeps the same ranges.
            length = np.random.randint(
                low=lower_seq_len, high=upper_seq_len + 1, size=[1]
            )[0]
            ids = np.random.randint(
                low=0, high=word_dict_size, size=[length]
            ).astype('int64')
            label = np.random.randint(
                low=0, high=class_dim, size=[1]
            ).astype('int64')[0]
yield ids, label
return __reader__
def get_places():
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
return places
@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
scope = fluid.core.Scope()
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
with fluid.program_guard(main_prog, startup_prog):
yield
def bow_net(
data,
label,
dict_dim,
is_sparse=False,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2,
):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = paddle.static.nn.sequence_lod.sequence_pool(
input=emb, pool_type='sum'
)
bow_tanh = paddle.tanh(bow)
fc_1 = paddle.static.nn.fc(x=bow_tanh, size=hid_dim, activation="tanh")
fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim2, activation="tanh")
prediction = paddle.static.nn.fc(
x=[fc_2], size=class_dim, activation="softmax"
)
cost = paddle.nn.functional.cross_entropy(
input=prediction, label=label, reduction='none', use_softmax=False
)
avg_cost = paddle.mean(x=cost)
return avg_cost
class TestWeightDecay(unittest.TestCase):
def setUp(self):
# set seed
np.random.seed(SEED)
paddle.seed(SEED)
paddle.framework.random._manual_program_seed(SEED)
# configs
self.word_dict_len = 5147
batch_size = 2
reader = fake_imdb_reader(self.word_dict_len, batch_size * 100)
reader = paddle.batch(reader, batch_size=batch_size)()
self.train_data = [next(reader) for _ in range(3)]
self.learning_rate = 0.5
def run_program(self, place, feed_list):
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe.run(fluid.default_startup_program())
main_prog = fluid.default_main_program()
param_list = [var.name for var in main_prog.block(0).all_parameters()]
param_sum = []
for data in self.train_data:
out = exe.run(
main_prog, feed=feeder.feed(data), fetch_list=param_list
)
p_sum = 0
for v in out:
p_sum += np.sum(np.abs(v))
param_sum.append(p_sum)
return param_sum
def check_weight_decay(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = paddle.static.data(
name="words", shape=[-1, 1], dtype="int64", lod_level=1
)
label = paddle.static.data(
name="label", shape=[-1, 1], dtype="int64"
)
avg_cost = model(data, label, self.word_dict_len)
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam
)
optimizer = AdamW(
learning_rate=self.learning_rate,
weight_decay=self.learning_rate,
)
optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label])
return param_sum
def check_weight_decay2(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = paddle.static.data(
name="words", shape=[-1, 1], dtype="int64", lod_level=1
)
label = paddle.static.data(
name="label", shape=[-1, 1], dtype="int64"
)
avg_cost = model(data, label, self.word_dict_len)
optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
params_grads = optimizer.backward(avg_cost)
param_list = [
(var, var * self.learning_rate)
for var in main_prog.block(0).all_parameters()
]
for params in param_list:
updated_p = paddle.subtract(x=params[0], y=params[1])
paddle.assign(updated_p, output=params[0])
optimizer.apply_optimize(avg_cost, startup_prog, params_grads)
param_sum = self.run_program(place, [data, label])
return param_sum
def test_weight_decay(self):
for place in get_places():
model = partial(bow_net, is_sparse=False)
param_sum1 = self.check_weight_decay(place, model)
param_sum2 = self.check_weight_decay2(place, model)
for i in range(len(param_sum1)):
np.testing.assert_allclose(
param_sum1[i],
param_sum2[i],
rtol=1e-05,
err_msg='Current place: {}, i: {}, sum1: {}, sum2: {}'.format(
place,
i,
param_sum1[i][
~np.isclose(param_sum1[i], param_sum2[i])
],
param_sum2[i][
~np.isclose(param_sum1[i], param_sum2[i])
],
),
)
if __name__ == '__main__':
unittest.main()
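The removed test asserts that the contrib wrapper and an explicit param - coeff * param subtraction before the plain Adam update leave the parameters with the same absolute sums. As background on why this "decoupled" formulation matters, the sketch below contrasts it with folding the decay into the gradient as L2 regularization, using a single SGD-style step and illustrative values (NumPy only, not Paddle code).

import numpy as np

w = np.array([1.0, -2.0], dtype='float32')   # parameters
g = np.array([0.3, 0.1], dtype='float32')    # gradient of the loss w.r.t. w
lr, coeff = 0.5, 0.01

# L2 regularization: the decay term is added to the gradient and then scaled
# by the learning rate together with it.
w_l2 = w - lr * (g + coeff * w)

# Decoupled weight decay (what the removed wrapper implements): the decay is
# subtracted from the parameter separately from the optimizer step.
w_decoupled = (w - coeff * w) - lr * g

# For plain SGD the two only coincide if the coefficients are rescaled
# (coeff_decoupled == lr * coeff_l2); for adaptive optimizers such as Adam
# they genuinely differ, which is the point of arXiv:1711.05101.
print(w_l2, w_decoupled)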