test_weight_decay_extend.py 6.8 KB
Newer Older
C
chengduo 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
import contextlib
C
chengduo 已提交
16 17
import unittest
from functools import partial
18

C
chengduo 已提交
19
import numpy as np
20

C
chengduo 已提交
21
import paddle
22
from paddle import fluid
C
chengduo 已提交
23

P
pangyoki 已提交
24 25
# Enable static-graph mode; the tests below build Programs and run them
# through an Executor.
paddle.enable_static()

26 27
# Seed applied to the NumPy and Paddle RNGs in TestWeightDecay.setUp.
SEED = 2020

C
chengduo 已提交
28

29 30 31 32 33 34 35
def fake_imdb_reader(
    word_dict_size,
    sample_num,
    lower_seq_len=100,
    upper_seq_len=200,
    class_dim=2,
):
    """Build a reader that yields random IMDB-like ``(ids, label)`` samples.

    Args:
        word_dict_size: Vocabulary size; word ids are drawn from
            ``[0, word_dict_size - 1]``.
        sample_num: Number of samples produced by one pass of the reader.
        lower_seq_len: Smallest sequence length (inclusive).
        upper_seq_len: Largest sequence length (inclusive).
        class_dim: Number of classes; labels are drawn from
            ``[0, class_dim - 1]``.

    Returns:
        A zero-argument generator function. Each yielded sample is a 1-D
        ``int64`` array of word ids and an ``int64`` scalar label, drawn
        from NumPy's global random state.
    """

    def __reader__():
        for _ in range(sample_num):
            # ``np.random.random_integers`` (inclusive upper bound) is
            # deprecated; ``randint`` takes an exclusive upper bound, so
            # passing ``high + 1`` draws from the same inclusive range.
            length = np.random.randint(
                lower_seq_len, upper_seq_len + 1, size=[1]
            )[0]
            ids = np.random.randint(
                0, word_dict_size, size=[length]
            ).astype('int64')
            label = np.random.randint(0, class_dim, size=[1]).astype(
                'int64'
            )[0]
            yield ids, label

    return __reader__


C
chengduo 已提交
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
def get_places():
    """Return the places to test on: CPU always, plus CUDA device 0 when
    Paddle was compiled with CUDA."""
    cpu_only = [fluid.CPUPlace()]
    if not fluid.core.is_compiled_with_cuda():
        return cpu_only
    return cpu_only + [fluid.CUDAPlace(0)]


@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
    """Context manager that runs its body under a new Scope, a unique-name
    guard, and a program guard for ``main_prog`` / ``startup_prog``.

    The guards are entered in the order unique-name, scope, program and
    released in reverse when the body finishes.
    """
    fresh_scope = fluid.core.Scope()
    with fluid.unique_name.guard(), fluid.scope_guard(fresh_scope), \
            fluid.program_guard(main_prog, startup_prog):
        yield


68 69 70 71 72 73 74 75 76 77
def bow_net(
    data,
    label,
    dict_dim,
    is_sparse=False,
    emb_dim=128,
    hid_dim=128,
    hid_dim2=96,
    class_dim=2,
):
    """
    Bag-of-words classifier; returns the mean cross-entropy loss.

    Pipeline: embedding -> sum sequence-pool -> tanh -> two tanh FC layers
    -> softmax FC -> cross entropy -> mean.

    This model is from https://github.com/PaddlePaddle/models:
    fluid/PaddleNLP/text_classification/nets.py
    """
    embedded = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], is_sparse=is_sparse
    )
    pooled = paddle.static.nn.sequence_lod.sequence_pool(
        input=embedded, pool_type='sum'
    )
    hidden = paddle.tanh(pooled)

    # Two stacked tanh FC layers, created in the order hid_dim, hid_dim2.
    for width in (hid_dim, hid_dim2):
        hidden = paddle.static.nn.fc(x=hidden, size=width, activation="tanh")

    probabilities = paddle.static.nn.fc(
        x=[hidden], size=class_dim, activation="softmax"
    )

    # The FC layer above already applies softmax, hence use_softmax=False.
    per_sample_cost = paddle.nn.functional.cross_entropy(
        input=probabilities,
        label=label,
        reduction='none',
        use_softmax=False,
    )
    return paddle.mean(x=per_sample_cost)


class TestWeightDecay(unittest.TestCase):
    """Compare the optimizer built by ``extend_with_decoupled_weight_decay``
    against plain Adam preceded by a hand-written ``p -= lr * p`` step."""

    def setUp(self):
        # Seed NumPy and Paddle before any data or program is created.
        np.random.seed(SEED)
        paddle.seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)

        self.word_dict_len = 5147
        self.learning_rate = 0.5

        batch_size = 2
        sample_reader = fake_imdb_reader(self.word_dict_len, batch_size * 100)
        batches = paddle.batch(sample_reader, batch_size=batch_size)()
        # Only the first three batches are used for training.
        self.train_data = [next(batches) for _ in range(3)]

    def run_program(self, place, feed_list):
        """Run the default startup program, then the default main program
        once per batch in ``self.train_data``.

        Returns a list with one entry per batch: the sum of absolute values
        over all fetched parameters of block 0 for that run.
        """
        executor = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
        executor.run(fluid.default_startup_program())

        program = fluid.default_main_program()
        param_names = [p.name for p in program.block(0).all_parameters()]

        abs_sums = []
        for batch in self.train_data:
            fetched = executor.run(
                program, feed=feeder.feed(batch), fetch_list=param_names
            )
            abs_sums.append(sum(np.sum(np.abs(value)) for value in fetched))
        return abs_sums

    def _build_loss(self, model):
        """Declare the ``words`` / ``label`` inputs and apply ``model``.

        Returns ``(data, label, avg_cost)``.
        """
        data = paddle.static.data(
            name="words", shape=[-1, 1], dtype="int64", lod_level=1
        )
        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
        avg_cost = model(data, label, self.word_dict_len)
        return data, label, avg_cost

    def check_weight_decay(self, place, model):
        """Train with Adam extended by decoupled weight decay (coefficient
        equal to the learning rate) and return the per-step parameter sums."""
        main_prog = fluid.framework.Program()
        startup_prog = fluid.framework.Program()

        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
            data, label, avg_cost = self._build_loss(model)

            adamw_cls = fluid.contrib.extend_with_decoupled_weight_decay(
                fluid.optimizer.Adam
            )
            optimizer = adamw_cls(
                learning_rate=self.learning_rate,
                weight_decay=self.learning_rate,
            )
            optimizer.minimize(avg_cost)

            return self.run_program(place, [data, label])

    def check_weight_decay2(self, place, model):
        """Train with plain Adam, inserting ``p = p - lr * p`` between the
        backward pass and the optimizer update, and return the per-step
        parameter sums."""
        main_prog = fluid.framework.Program()
        startup_prog = fluid.framework.Program()

        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
            data, label, avg_cost = self._build_loss(model)

            optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
            params_grads = optimizer.backward(avg_cost)

            # Build every scaled term first, then emit the subtract/assign
            # pairs, so ops are added to the program in that order.
            decay_pairs = [
                (param, param * self.learning_rate)
                for param in main_prog.block(0).all_parameters()
            ]
            for param, decay in decay_pairs:
                decayed = paddle.subtract(x=param, y=decay)
                paddle.assign(decayed, output=param)

            optimizer.apply_optimize(avg_cost, startup_prog, params_grads)

            return self.run_program(place, [data, label])

    def test_weight_decay(self):
        # Both training variants must give matching parameter sums after
        # every step, on every available place.
        for place in get_places():
            model = partial(bow_net, is_sparse=False)
            sums_extended = self.check_weight_decay(place, model)
            sums_manual = self.check_weight_decay2(place, model)

            for step, extended in enumerate(sums_extended):
                manual = sums_manual[step]
                mismatch = ~np.isclose(extended, manual)
                np.testing.assert_allclose(
                    extended,
                    manual,
                    rtol=1e-05,
                    err_msg='Current place: {}, i: {}, sum1: {}, sum2: {}'.format(
                        place,
                        step,
                        extended[mismatch],
                        manual[mismatch],
                    ),
                )
C
chengduo 已提交
216 217 218 219


# Run the tests when this file is executed directly.
if __name__ == '__main__':
    unittest.main()