From 3cca2a8772c37161c80c1e1095eef87dbb020d0b Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Fri, 7 Jul 2023 15:27:23 +0800 Subject: [PATCH] remove the extend_optimizer_with_weight_decay function (#55007) * remove the extend_optimizer_with_weight_decay function * Update __init__.py * fix bug * fix bug --- python/paddle/fluid/contrib/__init__.py | 4 - .../contrib/extend_optimizer/__init__.py | 19 -- .../extend_optimizer_with_weight_decay.py | 163 ------------- python/setup.py.in | 1 - setup.py | 1 - test/contrib/CMakeLists.txt | 1 - test/contrib/test_weight_decay_extend.py | 219 ------------------ 7 files changed, 408 deletions(-) delete mode 100644 python/paddle/fluid/contrib/extend_optimizer/__init__.py delete mode 100644 python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py delete mode 100644 test/contrib/test_weight_decay_extend.py diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 7f8edf0c8cb..c1f884fdb4a 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -14,13 +14,9 @@ # limitations under the License. -from . import extend_optimizer -from .extend_optimizer import * - from . import optimizer from .optimizer import * __all__ = [] -__all__ += extend_optimizer.__all__ __all__ += optimizer.__all__ diff --git a/python/paddle/fluid/contrib/extend_optimizer/__init__.py b/python/paddle/fluid/contrib/extend_optimizer/__init__.py deleted file mode 100644 index 42511f66c41..00000000000 --- a/python/paddle/fluid/contrib/extend_optimizer/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import extend_optimizer_with_weight_decay -from .extend_optimizer_with_weight_decay import * - -__all__ = [] -__all__ += extend_optimizer_with_weight_decay.__all__ diff --git a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py deleted file mode 100644 index 315c089ae10..00000000000 --- a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import paddle.fluid
-from paddle.fluid import framework as framework
-
-__all__ = ["extend_with_decoupled_weight_decay"]
-
-
-class DecoupledWeightDecay:
-    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
-        if not isinstance(coeff, float) and not isinstance(
-            coeff, framework.Variable
-        ):
-            raise TypeError("coeff should be float or Variable.")
-        self._params_name = set()
-        self._apply_decay_param_fun = apply_decay_param_fun
-        self._coeff = coeff
-        super().__init__(**kwargs)
-
-    def _scale_parameters(self, params_and_grads):
-        """
-        Adds weight decay ops.
-            scaled_parameter = parameter * coeff
-
-        Args:
-            params_and_grads: A list of (parameter, gradient) pairs of
-                the parameters to be decayed.
-        Raises:
-            Exception: The types of coeff and parameter are not consistent.
-        """
-        if isinstance(self._coeff, float) and self._coeff == 0.0:
-            return []
-
-        scaled_params = []
-        for param, grad in params_and_grads:
-            # If no gradient then we don't need to do anything
-            if grad is None:
-                continue
-            if (
-                self._apply_decay_param_fun is not None
-                and not self._apply_decay_param_fun(param.name)
-            ):
-                continue
-
-            if isinstance(self._coeff, float):
-                assert (
-                    param.dtype == paddle.fluid.core.VarDesc.VarType.FP32
-                ), (
-                    "the type of coeff(float) and parameter(%s) is not consistent."
-                    % (param.dtype)
-                )
-            else:
-                assert self._coeff.dtype == param.dtype, (
-                    "the type of coeff(%s) and parameter(%s) is not consistent."
-                    % (self._coeff.dtype, param.dtype)
-                )
-
-            with param.block.program._optimized_guard(
-                [param, grad]
-            ), framework.name_scope('weight decay'):
-                assert param.name not in self._params_name
-                scaled_params.append((param, grad, param * self._coeff))
-                self._params_name.add(param.name)
-        return scaled_params
-
-    def backward(self, **kargs):
-        return super().backward(**kargs)
-
-    def apply_optimize(self, **kargs):
-        return super().apply_optimize(**kargs)
-
-    def minimize(
-        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
-    ):
-        params_grads = self.backward(
-            loss=loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set,
-        )
-        scaled_params = self._scale_parameters(params_grads)
-        for p_grad_sgrad in scaled_params:
-            param, grad, scaled_param = p_grad_sgrad
-            with param.block.program._optimized_guard(
-                [param, grad]
-            ), framework.name_scope('weight decay'):
-                updated_param = paddle.subtract(x=param, y=scaled_param)
-                paddle.assign(updated_param, output=param)
-
-        optimize_ops = self.apply_optimize(
-            loss=loss,
-            params_grads=params_grads,
-            startup_program=startup_program,
-        )
-        return optimize_ops, params_grads
-
-    def __str__(self):
-        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
-
-
-def extend_with_decoupled_weight_decay(base_optimizer):
-    """
-    extend_with_decoupled_weight_decay is a decorator function that returns
-    an optimizer class with decoupled weight decay. The returned optimizer
-    applies weight decay to the optimized parameters using the parameter
-    values from before optimization, i.e.:
-    new_parameter = optimized_parameter - parameter * coeff. For details of
-    decoupled weight decay, please refer to
-    `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
-
-    Args:
-        base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.
-
-    Returns:
-        OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.
-
-    Examples:
-
-        .. code-block:: python
-
-            AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
-                fluid.optimizer.Adam)
-            optimizer = AdamW(learning_rate=0.1,
-                              weight_decay=0.01)
-
-            optimizer.minimize(cost)
-    """
-    if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
-        raise TypeError(
-            "The input(base_optimizer) should be a derived class of Optimizer."
-        )
-
-    class OptimizerWithDecoupledWeightDecay(
-        DecoupledWeightDecay, base_optimizer
-    ):
-        """
-        OptimizerWithDecoupledWeightDecay updates the optimized parameters
-        with the parameter values from before optimization. For more
-        information, please refer to: https://arxiv.org/pdf/1711.05101.pdf.
-
-        Args:
-            weight_decay (float|Variable): The weight decay coefficient,
-                which can be a float or a Variable.
-            apply_decay_param_fun (function|None): If it is not None,
-                only variables for which apply_decay_param_fun(variable) == True
-                will be decayed. Use it when weight decay should be applied
-                only to specified variables. Default: None.
-        """
-
-        def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
-            super().__init__(weight_decay, apply_decay_param_fun, **kwargs)
-
-    return OptimizerWithDecoupledWeightDecay
diff --git a/python/setup.py.in b/python/setup.py.in
index 609100f2e7a..60d2a6a1d6f 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -427,7 +427,6 @@ packages=['paddle',
                'paddle.fluid.proto.profiler',
                'paddle.fluid.layers',
                'paddle.fluid.contrib',
-               'paddle.fluid.contrib.extend_optimizer',
                'paddle.fluid.incubate',
                'paddle.incubate.distributed.fleet',
                'paddle.fluid.incubate.checkpoint',
diff --git a/setup.py b/setup.py
index 7bdd5ea3726..d5de08adb24 100644
--- a/setup.py
+++ b/setup.py
@@ -1431,7 +1431,6 @@ def get_setup_parameters():
         'paddle.fluid.proto.profiler',
         'paddle.fluid.layers',
         'paddle.fluid.contrib',
-        'paddle.fluid.contrib.extend_optimizer',
         'paddle.fluid.incubate',
         'paddle.incubate.distributed.fleet',
         'paddle.fluid.incubate.checkpoint',
diff --git a/test/contrib/CMakeLists.txt b/test/contrib/CMakeLists.txt
index 12f6bcffe88..7b241ef6fe9 100644
--- a/test/contrib/CMakeLists.txt
+++ b/test/contrib/CMakeLists.txt
@@ -20,5 +20,4 @@ py_test_modules(
   FLAGS_conv_workspace_size_limit=1000)
 
 set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
-set_tests_properties(test_weight_decay_extend PROPERTIES TIMEOUT 120)
 set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120)
diff --git a/test/contrib/test_weight_decay_extend.py b/test/contrib/test_weight_decay_extend.py
deleted file mode 100644
index bcccec3b682..00000000000
--- a/test/contrib/test_weight_decay_extend.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import contextlib -import unittest -from functools import partial - -import numpy as np - -import paddle -from paddle import fluid - -paddle.enable_static() - -SEED = 2020 - - -def fake_imdb_reader( - word_dict_size, - sample_num, - lower_seq_len=100, - upper_seq_len=200, - class_dim=2, -): - def __reader__(): - for _ in range(sample_num): - length = np.random.random_integers( - low=lower_seq_len, high=upper_seq_len, size=[1] - )[0] - ids = np.random.random_integers( - low=0, high=word_dict_size - 1, size=[length] - ).astype('int64') - label = np.random.random_integers( - low=0, high=class_dim - 1, size=[1] - ).astype('int64')[0] - yield ids, label - - return __reader__ - - -def get_places(): - places = [fluid.CPUPlace()] - if fluid.core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - return places - - -@contextlib.contextmanager -def prog_scope_guard(main_prog, startup_prog): - scope = fluid.core.Scope() - with fluid.unique_name.guard(): - with fluid.scope_guard(scope): - with fluid.program_guard(main_prog, startup_prog): - yield - - -def bow_net( - data, - label, - dict_dim, - is_sparse=False, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, -): - """ - BOW net - This model is from https://github.com/PaddlePaddle/models: - fluid/PaddleNLP/text_classification/nets.py - """ - emb = fluid.layers.embedding( - input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] - ) - bow = paddle.static.nn.sequence_lod.sequence_pool( - input=emb, pool_type='sum' - ) - bow_tanh = paddle.tanh(bow) - fc_1 = paddle.static.nn.fc(x=bow_tanh, size=hid_dim, activation="tanh") - fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim2, activation="tanh") - prediction = paddle.static.nn.fc( - x=[fc_2], size=class_dim, activation="softmax" - ) - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - avg_cost = paddle.mean(x=cost) - - return avg_cost - - -class TestWeightDecay(unittest.TestCase): - def setUp(self): - # set seed - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - # configs - self.word_dict_len = 5147 - batch_size = 2 - reader = fake_imdb_reader(self.word_dict_len, batch_size * 100) - reader = paddle.batch(reader, batch_size=batch_size)() - self.train_data = [next(reader) for _ in range(3)] - self.learning_rate = 0.5 - - def run_program(self, place, feed_list): - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=feed_list, place=place) - exe.run(fluid.default_startup_program()) - - main_prog = fluid.default_main_program() - param_list = [var.name for var in main_prog.block(0).all_parameters()] - - param_sum = [] - for data in self.train_data: - out = exe.run( - main_prog, feed=feeder.feed(data), fetch_list=param_list - ) - p_sum = 0 - for v in out: - p_sum += np.sum(np.abs(v)) - param_sum.append(p_sum) - return param_sum - - def check_weight_decay(self, place, model): - main_prog = fluid.framework.Program() - startup_prog = fluid.framework.Program() - - with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): - data = paddle.static.data( - name="words", shape=[-1, 1], dtype="int64", lod_level=1 - ) - label = paddle.static.data( - name="label", shape=[-1, 1], dtype="int64" - ) - avg_cost = model(data, label, self.word_dict_len) - AdamW = fluid.contrib.extend_with_decoupled_weight_decay( - fluid.optimizer.Adam - ) - - optimizer = AdamW( - learning_rate=self.learning_rate, - weight_decay=self.learning_rate, - ) - - optimizer.minimize(avg_cost) - 
-            param_sum = self.run_program(place, [data, label])
-
-        return param_sum
-
-    def check_weight_decay2(self, place, model):
-        main_prog = fluid.framework.Program()
-        startup_prog = fluid.framework.Program()
-
-        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = paddle.static.data(
-                name="words", shape=[-1, 1], dtype="int64", lod_level=1
-            )
-            label = paddle.static.data(
-                name="label", shape=[-1, 1], dtype="int64"
-            )
-
-            avg_cost = model(data, label, self.word_dict_len)
-
-            optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
-
-            params_grads = optimizer.backward(avg_cost)
-
-            param_list = [
-                (var, var * self.learning_rate)
-                for var in main_prog.block(0).all_parameters()
-            ]
-
-            for params in param_list:
-                updated_p = paddle.subtract(x=params[0], y=params[1])
-                paddle.assign(updated_p, output=params[0])
-
-            optimizer.apply_optimize(avg_cost, startup_prog, params_grads)
-
-            param_sum = self.run_program(place, [data, label])
-        return param_sum
-
-    def test_weight_decay(self):
-        for place in get_places():
-            model = partial(bow_net, is_sparse=False)
-            param_sum1 = self.check_weight_decay(place, model)
-            param_sum2 = self.check_weight_decay2(place, model)
-
-            for i in range(len(param_sum1)):
-                np.testing.assert_allclose(
-                    param_sum1[i],
-                    param_sum2[i],
-                    rtol=1e-05,
-                    err_msg='Current place: {}, i: {}, sum1: {}, sum2: {}'.format(
-                        place,
-                        i,
-                        param_sum1[i][
-                            ~np.isclose(param_sum1[i], param_sum2[i])
-                        ],
-                        param_sum2[i][
-                            ~np.isclose(param_sum1[i], param_sum2[i])
-                        ],
-                    ),
-                )
-
-
-if __name__ == '__main__':
-    unittest.main()
-- 
GitLab
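
Note on the removed API: the helper deleted above implemented decoupled weight
decay (Loshchilov & Hutter, https://arxiv.org/pdf/1711.05101.pdf) by wrapping a
base optimizer and subtracting parameter * coeff around its update step. Below
is a minimal dygraph sketch of the same idea using the built-in
paddle.optimizer.AdamW; the Linear model and random data are hypothetical
stand-ins, and this is an illustration under Paddle 2.x assumptions, not part
of this patch.

.. code-block:: python

    import paddle

    # Hypothetical stand-in model; any parameter list works here.
    linear = paddle.nn.Linear(10, 2)

    # AdamW applies decoupled weight decay itself, roughly
    #   param <- param - lr * coeff * param
    # alongside the Adam update, so no wrapper class is needed.
    opt = paddle.optimizer.AdamW(
        learning_rate=0.1,
        weight_decay=0.01,
        parameters=linear.parameters(),
    )

    x = paddle.randn([4, 10])
    loss = paddle.mean(linear(x))
    loss.backward()
    opt.step()        # Adam step plus decoupled decay
    opt.clear_grad()

Unlike AdamW, the removed wrapper subtracted parameter * coeff without the
learning-rate factor, so the weight_decay coefficient may need rescaling when
migrating.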