# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
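
"""Tests for paddle's learning rate schedules: decayed values are compared
against pure-Python reference implementations, and scheduler state is checked
across save/load."""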

import copy
import math
import unittest

import numpy as np

import paddle
from paddle import fluid
from paddle.fluid import core, framework


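# Pure-Python reference implementations of the decay schedules under test.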
def exponential_decay(
    learning_rate, global_step, decay_steps, decay_rate, staircase=False
):
    exponent = global_step / decay_steps
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * decay_rate**exponent


def natural_exp_decay(
    learning_rate, global_step, decay_steps, decay_rate, staircase=False
):
    exponent = float(global_step) / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * math.exp(-1 * decay_rate * exponent)


def inverse_time_decay(
    learning_rate, global_step, decay_steps, decay_rate, staircase=False
):
    temp = float(global_step) / float(decay_steps)
    if staircase:
        temp = math.floor(temp)
    return learning_rate / (1 + decay_rate * temp)


def polynomial_decay(
    learning_rate,
    global_step,
    decay_steps,
    end_learning_rate=0.0001,
    power=1.0,
    cycle=False,
):
    if cycle:
        div = math.ceil(global_step / float(decay_steps))
        if div == 0:
            div = 1
        decay_steps = decay_steps * div
    else:
        global_step = min(global_step, decay_steps)
    return (learning_rate - end_learning_rate) * (
        (1 - float(global_step) / float(decay_steps)) ** power
    ) + end_learning_rate


def piecewise_decay(global_step, boundaries, values):
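    # Return the configured value for the interval that global_step falls into.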
    assert len(boundaries) + 1 == len(values)
    for i in range(len(boundaries)):
        if global_step < boundaries[i]:
            return values[i]
    return values[-1]


def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
    cur_epoch = math.floor(global_step / step_each_epoch)
    decayed_lr = (
        learning_rate * 0.5 * (math.cos(cur_epoch * math.pi / epochs) + 1)
    )
    return decayed_lr


def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
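    # Noam schedule: lr * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5).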
    a = math.pow(global_step, -0.5)
    b = math.pow(warmup_steps, -1.5) * global_step
    decayed_lr = learning_rate * math.pow(d_model, -0.5) * min(a, b)

    return decayed_lr


def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
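    # Linear ramp from start_lr towards end_lr over warmup_steps.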
    linear_step = end_lr - start_lr
    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
    return decayed_lr


def multi_step_decay(global_step, learning_rate, milestones, decay_rate=0.1):
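    # Multiply the learning rate by decay_rate once for every milestone already passed.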
    for i in range(len(milestones)):
        if global_step < milestones[i]:
            return learning_rate * math.pow(decay_rate, i)

    return learning_rate * math.pow(decay_rate, len(milestones))


def step_decay(global_step, learning_rate, step_size, decay_rate=0.1):
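    # Multiply the learning rate by decay_rate every step_size steps.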
    return learning_rate * math.pow(decay_rate, global_step // step_size)


def lambda_decay(global_step, learning_rate, lr_lambda):
    return learning_rate * lr_lambda(global_step)


class TestLearningRateDecayDygraph(unittest.TestCase):
    def test_LR_state_dict(self):
        with fluid.dygraph.guard():
            x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            input = fluid.dygraph.to_variable(x)

            Exponential_scheduler = paddle.optimizer.lr.ExponentialDecay(
                learning_rate=0.1,
                gamma=0.5,
            )
            Step_scheduler = paddle.optimizer.lr.StepDecay(0.5, step_size=3)
            Reducelr_scheduler = paddle.optimizer.lr.ReduceOnPlateau(
                learning_rate=1.0, factor=0.5, patience=5, cooldown=3
            )

            adam1 = paddle.optimizer.Adam(
                learning_rate=Exponential_scheduler,
                parameters=linear.parameters(),
            )
            adam2 = paddle.optimizer.Adam(
                learning_rate=Step_scheduler, parameters=linear.parameters()
            )
            adam3 = paddle.optimizer.Adam(
                learning_rate=Reducelr_scheduler,
                parameters=linear.parameters(),
            )
            print(adam3.state_dict())

            for epoch in range(10):
                out = linear(input)
                loss = paddle.mean(out)
                loss.backward()
                adam1.minimize(loss)
                adam2.minimize(loss)
                adam3.minimize(loss)
                linear.clear_gradients()

                Step_scheduler.get_lr()
                Reducelr_scheduler.step(loss)

            paddle.save(linear.state_dict(), "save_path.pdparams")

            Exponential_scheduler_test = paddle.optimizer.lr.ExponentialDecay(
                learning_rate=0.1,
                gamma=0.5,
            )
            Step_scheduler_test = paddle.optimizer.lr.StepDecay(
                0.5, step_size=3
            )
            Reducelr_scheduler_test = paddle.optimizer.lr.ReduceOnPlateau(
                learning_rate=1.0, factor=0.5, patience=5, cooldown=3
            )

            paddle.save(adam1.state_dict(), "save_path.pdopt")
            opt_state = paddle.load("save_path.pdopt")
            adam_test = paddle.optimizer.Adam(
                learning_rate=Exponential_scheduler_test,
                parameters=linear.parameters(),
            )
            adam_test.set_state_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.last_epoch,
                adam1._learning_rate.last_epoch,
                "last_epoch is different before and after set_state_dict",
            )

            paddle.save(adam2.state_dict(), "save_path.pdopt")
            opt_state = paddle.load("save_path.pdopt")
            adam_test = paddle.optimizer.Adam(
                learning_rate=Step_scheduler_test,
                parameters=linear.parameters(),
            )
            adam_test.set_state_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.last_epoch,
                adam2._learning_rate.last_epoch,
                "epoch_num is different before and after set_state_dict",
            )
            self.assertEqual(
                adam_test._learning_rate(),
                adam2._learning_rate(),
                "current learning rate is different before and after set_state_dict",
            )

            paddle.save(adam3.state_dict(), "save_path.pdopt")
            opt_state = paddle.load("save_path.pdopt")
            adam_test = paddle.optimizer.Adam(
                learning_rate=Reducelr_scheduler_test,
                parameters=linear.parameters(),
            )
            adam_test.set_state_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.best,
                adam3._learning_rate.best,
                "best_loss is different before and after set_state_dict",
            )
            self.assertEqual(
                adam_test._learning_rate.cooldown_counter,
                adam3._learning_rate.cooldown_counter,
                "cooldown_counter is different before and after set_state_dict",
            )
            self.assertEqual(
                adam_test._learning_rate.num_bad_epochs,
                adam3._learning_rate.num_bad_epochs,
                "num_bad_epochs is different before and after set_state_dict",
            )
            self.assertEqual(
                adam_test._learning_rate.last_epoch,
                adam3._learning_rate.last_epoch,
                "epoch is different before and after set_state_dict",
            )
            self.assertEqual(
                adam_test._learning_rate(),
                adam3._learning_rate(),
                "current learning rate is different before and after set_state_dict",
            )

    def test_NoamDecay(self):
        with fluid.dygraph.guard():
            d_model = 0.01
            warmup_steps = 200
            learning_rate = 2.0
            lr = paddle.optimizer.lr.noam_decay(
                d_model, warmup_steps, learning_rate
            )
            for step in range(5):
                step += 1
                right_result = noam_decay(
                    step, d_model, warmup_steps, learning_rate
                )
                lr.step()
                fluid_result = lr()

                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in step {}, Python result is {}, Fluid result is {}'.format(
                        step, right_result, fluid_result
                    ),
                )

    def test_LinearLrWarmup(self):
        with fluid.dygraph.guard():
            lr = paddle.optimizer.lr.PolynomialDecay(
                learning_rate=1.0,
                decay_steps=10,
                end_lr=0.0,
                power=1.0,
            )
            lr.step()
            lr = paddle.optimizer.lr.LinearWarmup(
                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0
            )
            lr.step()
            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
            for i in range(5):
                if i == 1:
                    lr.step()
                t = lr()
                lr.step()
                np.testing.assert_allclose(t, right_result[i], rtol=1e-05)

            with self.assertRaises(TypeError):
                lr = paddle.optimizer.lr.linear_lr_warmup(
                    learning_rate="fake_lr",
                    warmup_steps=2,
                    start_lr=0.0,
                    end_lr=1.0,
                )

    def test_MultiStepDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            milestones = [2, 4, 8]
            decay_rate = 0.2
            linear = paddle.nn.Linear(10, 10)

            scheduler = paddle.optimizer.lr.MultiStepDecay(
                learning_rate, milestones, decay_rate
            )

            adam = paddle.optimizer.Adam(
                learning_rate=scheduler, parameters=linear.parameters()
            )
            for epoch in range(10):
                right_result = multi_step_decay(
                    epoch, learning_rate, milestones, decay_rate
                )
                fluid_result = adam.get_lr()
                adam.step()
                scheduler.step()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {}, Python result is {}, Fluid result is {}'.format(
                        epoch, right_result, fluid_result
                    ),
                )

            with self.assertRaises(ValueError):
                lr = paddle.optimizer.lr.MultiStepDecay(
                    learning_rate, [30, 50, 20], 0.1
                )

            with self.assertRaises(ValueError):
                lr = paddle.optimizer.lr.MultiStepDecay(
                    learning_rate, [20, 30, 50], 1
                )

            with self.assertRaises(TypeError):
                lr = paddle.optimizer.lr.MultiStepDecay("test", [20, 30, 50])

            with self.assertRaises(ValueError):
                lr = paddle.optimizer.lr.MultiStepDecay(-1, [20, 30, 50])

    def test_StepDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            step_size = 3
            decay_rate = 0.2
            scheduler = paddle.optimizer.lr.StepDecay(
                learning_rate, step_size, decay_rate
            )
            for epoch in range(10):
                right_result = step_decay(
                    epoch, learning_rate, step_size, decay_rate
                )
                fluid_result = scheduler()
                scheduler.get_lr()
                scheduler.step()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {}, Python result is {}, Fluid result is {}'.format(
                        epoch, right_result, fluid_result
                    ),
                )

            with self.assertRaises(TypeError):
                lr = paddle.optimizer.lr.StepDecay(learning_rate, "test", 0.1)

            with self.assertRaises(ValueError):
                lr = paddle.optimizer.lr.StepDecay(learning_rate, 20, 2)

    def test_LambdaDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            lr_lambda = lambda x: 0.95**x
            scheduler = paddle.optimizer.lr.LambdaDecay(
                learning_rate, lr_lambda
            )

            linear = paddle.nn.Linear(10, 10)
            adam = paddle.optimizer.Adam(
                scheduler, parameters=linear.parameters()
            )

            for epoch in range(30):
                right_result = lambda_decay(epoch, learning_rate, lr_lambda)
                fluid_result = scheduler()
                scheduler.get_lr()
                scheduler.step()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {}, Python result is {}, Fluid result is {}'.format(
                        epoch, right_result, fluid_result
                    ),
                )

            with self.assertRaises(TypeError):
                lr = paddle.optimizer.lr.LambdaDecay(learning_rate, "test")


class TestLearningRateDecay(unittest.TestCase):
    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
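        # Run the comparison on every available place (CPU, plus CUDA if compiled in).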
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for place in places:
            self.check_decay_with_place(
                place, python_decay_fn, fluid_decay_fn, kwargs
            )

    def check_decay_with_place(
        self, place, python_decay_fn, fluid_decay_fn, kwargs
    ):
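        # Build the decayed learning rate in a fresh program, then compare its
        # value at every step with the pure-Python reference.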
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = fluid_decay_fn(**kwargs)

        exe = fluid.Executor(place)

        exe.run(startup_prog)

        for step in range(10):
            # Step of NoamDecay starts from 1.
            if python_decay_fn.__name__ == 'noam_decay':
                step += 1
            (lr_val,) = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            python_decayed_lr = python_decay_fn(
                global_step=float(step), **kwargs
            )
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
                msg='Failed lr scheduler is {}, step {}, Python result is {}, Fluid result is {}'.format(
                    python_decay_fn.__name__,
                    str(step),
                    str(python_decayed_lr),
                    str(lr_val[0]),
                ),
            )

    def test_decay(self):
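        # The exponential-style decays share one set of kwargs and are exercised
        # both with and without staircase mode.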
        common_kwargs_true = {
            "learning_rate": 1.0,
            "decay_steps": 5,
            "decay_rate": 0.5,
            "staircase": True,
        }
        common_kwargs_false = copy.deepcopy(common_kwargs_true)
        common_kwargs_false["staircase"] = False

        decay_fns = [
            (
                exponential_decay,
                paddle.optimizer.lr.exponential_decay,
                common_kwargs_true,
            ),
            (
                exponential_decay,
                paddle.optimizer.lr.exponential_decay,
                common_kwargs_false,
            ),
            (
                natural_exp_decay,
                paddle.optimizer.lr.natural_exp_decay,
                common_kwargs_true,
            ),
            (
                natural_exp_decay,
                paddle.optimizer.lr.natural_exp_decay,
                common_kwargs_false,
            ),
            (
                inverse_time_decay,
                paddle.optimizer.lr.inverse_time_decay,
                common_kwargs_true,
            ),
            (
                inverse_time_decay,
                paddle.optimizer.lr.inverse_time_decay,
                common_kwargs_false,
            ),
            (
                polynomial_decay,
                paddle.optimizer.lr.polynomial_decay,
                {"learning_rate": 1.0, "decay_steps": 5, "cycle": True},
            ),
            (
                polynomial_decay,
                paddle.optimizer.lr.polynomial_decay,
                {"learning_rate": 1.0, "decay_steps": 5, "cycle": False},
            ),
            (
                piecewise_decay,
                paddle.optimizer.lr.piecewise_decay,
                {"boundaries": [3, 6, 9], "values": [0.1, 0.2, 0.3, 0.4]},
            ),
            (
                cosine_decay,
                paddle.optimizer.lr.cosine_decay,
                {"learning_rate": 0.1, "step_each_epoch": 100, "epochs": 120},
            ),
            (
                noam_decay,
                paddle.optimizer.lr.noam_decay,
                {"d_model": 0.01, "warmup_steps": 200, "learning_rate": 2.0},
            ),
        ]

        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
            print(
                "class="
                + self.__class__.__name__
                + " decay_fn="
                + py_decay_fn.__name__
                + " kwargs="
                + str(kwargs)
            )
            main_program = framework.Program()
            startup_program = framework.Program()
            with framework.program_guard(main_program, startup_program):
                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)


class TestLinearWarmupLearningRateDecay(unittest.TestCase):
    def check_decay_with_place(
        self, place, python_decay_fn, fluid_decay_fn, kwargs
    ):
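        # Before warmup_steps the expected value follows linear_lr_warmup;
        # afterwards it follows the wrapped decay function.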
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        warmup_steps = 10
        start_lr = 0.1 / 3.0
        end_lr = 0.1

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = paddle.optimizer.lr.linear_lr_warmup(
                fluid_decay_fn(**kwargs), warmup_steps, start_lr, end_lr
            )

        exe = fluid.Executor(place)
        exe.run(startup_prog)

        for step in range(20):
            # Step of NoamDecay starts from 1.
            if fluid_decay_fn.__name__ == 'noam_decay':
                step += 1
            (lr_val,) = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                python_decayed_lr = linear_lr_warmup(
                    float(step), warmup_steps, start_lr, end_lr
                )
            else:
                python_decayed_lr = python_decay_fn(
                    global_step=float(step), **kwargs
                )
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
                msg='Test {} Failed, step {}, Python result is {}, Fluid result is {}'.format(
                    python_decay_fn.__name__,
                    str(step),
                    str(python_decayed_lr),
                    str(lr_val[0]),
                ),
            )


class TestLinearWarmupLearningRateDecayWithScalarInput(unittest.TestCase):
    def run_scalar_lr(self, place, lr, start_lr, end_lr):
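        # With a scalar learning rate, the value after warmup is the constant lr itself.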
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        warmup_steps = 10

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = paddle.optimizer.lr.linear_lr_warmup(
                lr, warmup_steps, start_lr, end_lr
            )

        exe = fluid.Executor(place)
        exe.run(startup_prog)

        for step in range(20):
            (lr_val,) = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                expected_lr = linear_lr_warmup(
                    float(step), warmup_steps, start_lr, end_lr
                )
            else:
                expected_lr = lr
            self.assertAlmostEqual(
                expected_lr,
                lr_val[0],
                msg='Test failed, step {}, expected {}, but got {}'.format(
                    step, expected_lr, lr_val[0]
                ),
            )

    def test_scalar_lr(self):
        def run_places(lr, start_lr, end_lr):
            places = [fluid.CPUPlace()]
            if core.is_compiled_with_cuda():
                places.append(fluid.CUDAPlace(0))
            for p in places:
                self.run_scalar_lr(p, lr, start_lr, end_lr)

        # float
        lr = 0.2
        start_lr = 0.1 / 3.0
        end_lr = 0.2
        run_places(lr, start_lr, end_lr)

        # int end_lr
        lr = 2.0
        start_lr = 0.1 / 3.0
        end_lr = 1
        run_places(lr, start_lr, end_lr)

        # int
        lr = 1
        start_lr = 0
        end_lr = 1
        run_places(lr, start_lr, end_lr)


if __name__ == '__main__':
    unittest.main()