Unverified commit 3cca2a87, authored by LoneRanger, committed by GitHub

remove the extend_optimizer_with_weight_decay function (#55007)

* remove the extend_optimizer_with_weight_decay function

* Update __init__.py

* fix bug

* fix bug
Parent b3c26de8
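For users of the removed helper, the sketch below contrasts the old pattern with a possible replacement. It assumes `paddle.optimizer.AdamW` (which applies decoupled weight decay, the same scheme the removed module implemented) is an acceptable substitute; the exact migration path should be confirmed against the current Paddle documentation.

import paddle

paddle.enable_static()

# Old pattern (removed by this commit):
#   AdamW = fluid.contrib.extend_with_decoupled_weight_decay(fluid.optimizer.Adam)
#   optimizer = AdamW(learning_rate=0.1, weight_decay=0.01)
#   optimizer.minimize(cost)

# Assumed replacement: paddle.optimizer.AdamW with decoupled weight decay.
x = paddle.static.data(name="x", shape=[-1, 8], dtype="float32")
y = paddle.static.data(name="y", shape=[-1, 1], dtype="float32")
pred = paddle.static.nn.fc(x=x, size=1)
cost = paddle.nn.functional.mse_loss(pred, y)

optimizer = paddle.optimizer.AdamW(learning_rate=0.1, weight_decay=0.01)
optimizer.minimize(cost)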
......
@@ -14,13 +14,9 @@
# limitations under the License.
from . import extend_optimizer
from .extend_optimizer import *
from . import optimizer
from .optimizer import *
__all__ = []
__all__ += extend_optimizer.__all__
__all__ += optimizer.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import extend_optimizer_with_weight_decay
from .extend_optimizer_with_weight_decay import *
__all__ = []
__all__ += extend_optimizer_with_weight_decay.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid
from paddle.fluid import framework as framework
__all__ = ["extend_with_decoupled_weight_decay"]
class DecoupledWeightDecay:
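# Mixin that, combined with a base Optimizer, subtracts coeff * param from each
# decayed parameter before the base optimizer update (decoupled weight decay).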
def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
if not isinstance(coeff, float) and not isinstance(
coeff, framework.Variable
):
raise TypeError("coeff should be float or Variable.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
super().__init__(**kwargs)
def _scale_parameters(self, params_and_grads):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Args:
params_and_grads: A list of (parameters, gradients) pairs,
the parameters need to decay.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
if isinstance(self._coeff, float) and self._coeff == 0.0:
return
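# Collect (param, grad, coeff * param) triples for every parameter that should be decayed.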
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if (
self._apply_decay_param_fun is not None
and not self._apply_decay_param_fun(param.name)
):
continue
if isinstance(self._coeff, float):
assert (
param.dtype == paddle.fluid.core.VarDesc.VarType.FP32
), (
"the type of coeff(float) and parameter(%s) is not consistent."
% (param.dtype)
)
else:
assert self._coeff.dtype == param.dtype, (
"the type of coeff(%s) and parameter(%s) is not consistent."
% (self._coeff.dtype, param.dtype)
)
with param.block.program._optimized_guard(
[param, grad]
), framework.name_scope('weight decay'):
assert param.name not in self._params_name
scaled_params.append((param, grad, param * self._coeff))
self._params_name.add(param.name)
return scaled_params
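# backward() and apply_optimize() simply delegate to the wrapped base optimizer through the MRO.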
def backward(self, **kargs):
return super().backward(**kargs)
def apply_optimize(self, **kargs):
return super().apply_optimize(**kargs)
def minimize(
self, loss, startup_program=None, parameter_list=None, no_grad_set=None
):
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set,
)
scaled_params = self._scale_parameters(params_grads)
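# Apply the decoupled decay in place (param <- param - coeff * param) before the base optimizer update.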
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]
), framework.name_scope('weight decay'):
updated_param = paddle.subtract(x=param, y=scaled_param)
paddle.assign(updated_param, output=param)
optimize_ops = self.apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program,
)
return optimize_ops, params_grads
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
def extend_with_decoupled_weight_decay(base_optimizer):
"""
extend_with_decoupled_weight_decay is a decorator function; it returns an
optimizer class with decoupled weight decay. The returned optimizer will
apply weight decay on the optimized parameters with the parameters before
optimization, i.e.: new_parameter = optimized_parameter - parameter * coeff.
For the details of decoupled weight decay, please refer to
`DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
Args:
base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.
Returns:
OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.
Examples:
.. code-block:: python
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam)
optimizer = AdamW(learning_rate=0.1,
weight_decay=0.01)
optimizer.minimize(cost)
"""
if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
raise TypeError(
"The input(base_optimizer) should be a derived class of Optimizer."
)
class OptimizerWithDecoupledWeightDecay(
DecoupledWeightDecay, base_optimizer
):
"""
OptimizerWithDecoupledWeightDecay applies decoupled weight decay to the optimized
parameters, using the parameter values from before optimization. For more information, please refer to:
https://arxiv.org/pdf/1711.05101.pdf.
Args:
weight_decay (float|Variable): The weight decay coefficient, it can be
float or Variable.
apply_decay_param_fun (function|None): If it is not None,
only the variables for which apply_decay_param_fun(variable) == True
will be decayed. This is useful when the decay should only be
applied to a subset of the variables. Default: None.
"""
def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
super().__init__(weight_decay, apply_decay_param_fun, **kwargs)
return OptimizerWithDecoupledWeightDecay
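The update sequence in `minimize` above (decay first, then the base optimizer step on the unchanged gradient) can be shown with a minimal NumPy sketch; the coefficients and values below are made up for illustration, and plain SGD stands in for the wrapped optimizer.

import numpy as np

coeff, lr = 0.01, 0.1
param = np.array([1.0, -2.0, 0.5])   # hypothetical parameter
grad = np.array([0.2, -0.1, 0.4])    # hypothetical gradient

# Decoupled weight decay: first shrink the parameter by coeff * param ...
param = param - coeff * param
# ... then apply the base optimizer step (SGD here) on the unmodified gradient.
param = param - lr * grad
print(param)  # [0.97, -1.97, 0.455]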
......
@@ -427,7 +427,6 @@ packages=['paddle',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib.extend_optimizer',
'paddle.fluid.incubate',
'paddle.incubate.distributed.fleet',
'paddle.fluid.incubate.checkpoint',
......
......
@@ -1431,7 +1431,6 @@ def get_setup_parameters():
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib.extend_optimizer',
'paddle.fluid.incubate',
'paddle.incubate.distributed.fleet',
'paddle.fluid.incubate.checkpoint',
......
......
@@ -20,5 +20,4 @@ py_test_modules(
FLAGS_conv_workspace_size_limit=1000)
set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
set_tests_properties(test_weight_decay_extend PROPERTIES TIMEOUT 120)
set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
from functools import partial
import numpy as np
import paddle
from paddle import fluid
paddle.enable_static()
SEED = 2020
def fake_imdb_reader(
word_dict_size,
sample_num,
lower_seq_len=100,
upper_seq_len=200,
class_dim=2,
):
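# Generator that yields random (ids, label) pairs shaped like the IMDB text-classification data.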
def __reader__():
for _ in range(sample_num):
# np.random.randint has an exclusive upper bound, hence the adjusted bounds.
length = np.random.randint(
low=lower_seq_len, high=upper_seq_len + 1, size=[1]
)[0]
ids = np.random.randint(
low=0, high=word_dict_size, size=[length]
).astype('int64')
label = np.random.randint(
low=0, high=class_dim, size=[1]
).astype('int64')[0]
yield ids, label
return __reader__
def get_places():
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
return places
@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
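# Run the enclosed block in a fresh scope and fresh programs so each optimizer path starts from an identical state.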
scope = fluid.core.Scope()
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
with fluid.program_guard(main_prog, startup_prog):
yield
def bow_net(
data,
label,
dict_dim,
is_sparse=False,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2,
):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = paddle.static.nn.sequence_lod.sequence_pool(
input=emb, pool_type='sum'
)
bow_tanh = paddle.tanh(bow)
fc_1 = paddle.static.nn.fc(x=bow_tanh, size=hid_dim, activation="tanh")
fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim2, activation="tanh")
prediction = paddle.static.nn.fc(
x=[fc_2], size=class_dim, activation="softmax"
)
cost = paddle.nn.functional.cross_entropy(
input=prediction, label=label, reduction='none', use_softmax=False
)
avg_cost = paddle.mean(x=cost)
return avg_cost
class TestWeightDecay(unittest.TestCase):
def setUp(self):
# set seed
np.random.seed(SEED)
paddle.seed(SEED)
paddle.framework.random._manual_program_seed(SEED)
# configs
self.word_dict_len = 5147
batch_size = 2
reader = fake_imdb_reader(self.word_dict_len, batch_size * 100)
reader = paddle.batch(reader, batch_size=batch_size)()
self.train_data = [next(reader) for _ in range(3)]
self.learning_rate = 0.5
def run_program(self, place, feed_list):
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe.run(fluid.default_startup_program())
main_prog = fluid.default_main_program()
param_list = [var.name for var in main_prog.block(0).all_parameters()]
param_sum = []
for data in self.train_data:
out = exe.run(
main_prog, feed=feeder.feed(data), fetch_list=param_list
)
p_sum = 0
for v in out:
p_sum += np.sum(np.abs(v))
param_sum.append(p_sum)
return param_sum
def check_weight_decay(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = paddle.static.data(
name="words", shape=[-1, 1], dtype="int64", lod_level=1
)
label = paddle.static.data(
name="label", shape=[-1, 1], dtype="int64"
)
avg_cost = model(data, label, self.word_dict_len)
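# Path 1: Adam extended with decoupled weight decay via the helper under test.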
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam
)
optimizer = AdamW(
learning_rate=self.learning_rate,
weight_decay=self.learning_rate,
)
optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label])
return param_sum
def check_weight_decay2(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = paddle.static.data(
name="words", shape=[-1, 1], dtype="int64", lod_level=1
)
label = paddle.static.data(
name="label", shape=[-1, 1], dtype="int64"
)
avg_cost = model(data, label, self.word_dict_len)
optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
params_grads = optimizer.backward(avg_cost)
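# Path 2: apply the same decay by hand (param <- param - coeff * param, with coeff == learning_rate), then run plain Adam.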
param_list = [
(var, var * self.learning_rate)
for var in main_prog.block(0).all_parameters()
]
for params in param_list:
updated_p = paddle.subtract(x=params[0], y=params[1])
paddle.assign(updated_p, output=params[0])
optimizer.apply_optimize(avg_cost, startup_prog, params_grads)
param_sum = self.run_program(place, [data, label])
return param_sum
def test_weight_decay(self):
for place in get_places():
model = partial(bow_net, is_sparse=False)
param_sum1 = self.check_weight_decay(place, model)
param_sum2 = self.check_weight_decay2(place, model)
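# Both paths should leave the parameters in the same state, so their absolute sums must match.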
for i in range(len(param_sum1)):
np.testing.assert_allclose(
param_sum1[i],
param_sum2[i],
rtol=1e-05,
err_msg='Current place: {}, i: {}, sum1: {}, sum2: {}'.format(
place,
i,
param_sum1[i][
~np.isclose(param_sum1[i], param_sum2[i])
],
param_sum2[i][
~np.isclose(param_sum1[i], param_sum2[i])
],
),
)
if __name__ == '__main__':
unittest.main()