# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import math
import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
import paddle.fluid.layers as layers


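# Pure-Python reference implementations of the learning rate schedules under
# test. The test cases below run the corresponding fluid schedulers and check
# their output against these functions step by step.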
def exponential_decay(
    learning_rate, global_step, decay_steps, decay_rate, staircase=False
):
    exponent = global_step / decay_steps
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * decay_rate**exponent


def natural_exp_decay(
    learning_rate, global_step, decay_steps, decay_rate, staircase=False
):
    exponent = float(global_step) / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * math.exp(-1 * decay_rate * exponent)


def inverse_time_decay(
    learning_rate, global_step, decay_steps, decay_rate, staircase=False
):
    temp = float(global_step) / float(decay_steps)
    if staircase:
        temp = math.floor(temp)
    return learning_rate / (1 + decay_rate * temp)


def polynomial_decay(
    learning_rate,
    global_step,
    decay_steps,
    end_learning_rate=0.0001,
    power=1.0,
    cycle=False,
):
    if cycle:
        div = math.ceil(global_step / float(decay_steps))
        if div == 0:
            div = 1
        decay_steps = decay_steps * div
    else:
        global_step = min(global_step, decay_steps)
    return (learning_rate - end_learning_rate) * (
        (1 - float(global_step) / float(decay_steps)) ** power
    ) + end_learning_rate


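# Piecewise constant schedule: values[i] applies while global_step < boundaries[i];
# the last value applies once every boundary has been passed.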
def piecewise_decay(global_step, boundaries, values):
    assert len(boundaries) + 1 == len(values)
    for i in range(len(boundaries)):
        if global_step < boundaries[i]:
            return values[i]
    return values[-1]


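# Cosine annealing: the rate follows half a cosine period over `epochs` epochs,
# decaying from learning_rate towards zero.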
def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
    cur_epoch = math.floor(global_step / step_each_epoch)
    decayed_lr = (
        learning_rate * 0.5 * (math.cos(cur_epoch * math.pi / epochs) + 1)
    )
    return decayed_lr


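# Noam schedule: learning_rate * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5),
# i.e. linear warmup followed by inverse-square-root decay.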
def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
    a = math.pow(global_step, -0.5)
    b = math.pow(warmup_steps, -1.5) * global_step
    decayed_lr = learning_rate * math.pow(d_model, -0.5) * min(a, b)

    return decayed_lr


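# Linear warmup: ramps the rate linearly from start_lr to end_lr over warmup_steps steps.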
def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
    linear_step = end_lr - start_lr
    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
    return decayed_lr


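# Multi-step decay: the rate is multiplied by decay_rate once per milestone passed.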
def multi_step_decay(global_step, learning_rate, milestones, decay_rate=0.1):
    for i in range(len(milestones)):
        if global_step < milestones[i]:
            return learning_rate * math.pow(decay_rate, i)

    return learning_rate * math.pow(decay_rate, len(milestones))


def step_decay(global_step, learning_rate, step_size, decay_rate=0.1):
    return learning_rate * math.pow(decay_rate, global_step // step_size)


def lambda_decay(global_step, learning_rate, lr_lambda):
    return learning_rate * lr_lambda(global_step)


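# Dygraph-mode scheduler tests: exercise the fluid.dygraph decay classes,
# including saving and restoring scheduler state through the optimizer
# state_dict, and compare their values against the Python references above.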
class TestLearningRateDecayDygraph(unittest.TestCase):
    def test_LR_state_dict(self):
        with fluid.dygraph.guard():
            x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            input = fluid.dygraph.to_variable(x)

            Exponential_scheduler = fluid.dygraph.ExponentialDecay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            )
            Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
            Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3
            )

            adam1 = fluid.optimizer.Adam(
                learning_rate=Exponential_scheduler,
                parameter_list=linear.parameters(),
            )
            adam2 = fluid.optimizer.Adam(
                learning_rate=Step_scheduler, parameter_list=linear.parameters()
            )
            adam3 = fluid.optimizer.Adam(
                learning_rate=Reducelr_scheduler,
                parameter_list=linear.parameters(),
            )
            print(adam3.state_dict())

            for epoch in range(10):
                out = linear(input)
                loss = fluid.layers.reduce_mean(out)
                loss.backward()
                adam1.minimize(loss)
                adam2.minimize(loss)
                adam3.minimize(loss)
                linear.clear_gradients()

                Step_scheduler.epoch()
                Reducelr_scheduler.step(loss)

            fluid.dygraph.save_dygraph(linear.state_dict(), "save_path")

            Exponential_scheduler_test = fluid.dygraph.ExponentialDecay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            )
            Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3)
            Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau(
                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3
            )

            fluid.dygraph.save_dygraph(adam1.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Exponential_scheduler_test,
                parameter_list=linear.parameters(),
            )
            adam_test.set_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.step_num,
                adam1._learning_rate.step_num,
                "step_num is different before and after set_dict",
            )

            fluid.dygraph.save_dygraph(adam2.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Step_scheduler_test,
                parameter_list=linear.parameters(),
            )
            adam_test.set_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.epoch_num,
                adam2._learning_rate.epoch_num,
                "epoch_num is different before and after set_dict",
            )
            self.assertEqual(
                adam_test._learning_rate(),
                adam2._learning_rate(),
                "current learning rate is different before and after set_dict",
            )

            fluid.dygraph.save_dygraph(adam3.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Reducelr_scheduler_test,
                parameter_list=linear.parameters(),
            )
            adam_test.set_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.best_loss,
                adam3._learning_rate.best_loss.numpy()[0],
                "best_loss is different before and after set_dict",
            )
            self.assertEqual(
                adam_test._learning_rate.cooldown_counter,
                adam3._learning_rate.cooldown_counter,
                "cooldown_counter is different before and after set_dict",
            )
            self.assertEqual(
                adam_test._learning_rate.num_bad_epochs,
                adam3._learning_rate.num_bad_epochs,
                "num_bad_epochs is different before and after set_dict",
            )
            self.assertEqual(
                adam_test._learning_rate.epoch_num,
                adam3._learning_rate.epoch_num,
                "epoch is different before and after set_dict",
            )
            self.assertEqual(
                adam_test._learning_rate(),
                adam3._learning_rate(),
                "current learning rate is different before and after set_dict",
            )

    def test_NoamDecay(self):
        with fluid.dygraph.guard():
            d_model = 0.01
            warmup_steps = 200
            learning_rate = 2.0
            lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate)
            for step in range(5):
                step += 1
                right_result = noam_decay(
                    step, d_model, warmup_steps, learning_rate
                )
                fluid_result = lr()

                self.assertAlmostEqual(
                    right_result,
                    fluid_result[0],
                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.format(
                        step, right_result, fluid_result[0]
                    ),
                )

    def test_LinearLrWarmup(self):
        with fluid.dygraph.guard():
            lr = fluid.layers.polynomial_decay(
                learning_rate=1.0,
                decay_steps=10,
                end_learning_rate=0.0,
                power=1.0,
            )
            lr = fluid.layers.linear_lr_warmup(
                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0
            )

            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
            for i in range(5):
                t = lr()
                np.testing.assert_allclose(
                    t.numpy()[0].item(), right_result[i], rtol=1e-05
                )

            with self.assertRaises(TypeError):
                lr = fluid.layers.linear_lr_warmup(
                    learning_rate="fake_lr",
                    warmup_steps=2,
                    start_lr=0.0,
                    end_lr=1.0,
                )

    def test_MultiStepDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            milestones = [2, 4, 8]
            decay_rate = 0.2
            linear = paddle.nn.Linear(10, 10)

            scheduler = fluid.dygraph.MultiStepDecay(
                learning_rate, milestones, decay_rate
            )

            adam = fluid.optimizer.AdamOptimizer(
                learning_rate=scheduler, parameter_list=linear.parameters()
            )
            for epoch in range(10):
                right_result = multi_step_decay(
                    epoch, learning_rate, milestones, decay_rate
                )
                fluid_result = adam.current_step_lr()
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.format(
                        epoch, right_result, fluid_result
                    ),
                )

            with self.assertRaises(ValueError):
                lr = fluid.dygraph.MultiStepDecay(
                    learning_rate, [30, 50, 20], 0.1
                )

            with self.assertRaises(ValueError):
                lr = fluid.dygraph.MultiStepDecay(
                    learning_rate, [20, 30, 50], 1
                )

            with self.assertRaises(TypeError):
                lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50])

            with self.assertRaises(ValueError):
                lr = fluid.dygraph.MultiStepDecay(-1, [20, 30, 50])

    def test_StepDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            step_size = 3
            decay_rate = 0.2
            scheduler = fluid.dygraph.StepDecay(
                learning_rate, step_size, decay_rate
            )
            for epoch in range(10):
                right_result = step_decay(
                    epoch, learning_rate, step_size, decay_rate
                )
                fluid_result = scheduler().numpy()[0]
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.format(
                        epoch, right_result, fluid_result
                    ),
                )

            with self.assertRaises(TypeError):
                lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1)

            with self.assertRaises(ValueError):
                lr = fluid.dygraph.StepDecay(learning_rate, 20, 2)

    def test_LambdaDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            lr_lambda = lambda x: 0.95**x
            scheduler = fluid.dygraph.LambdaDecay(learning_rate, lr_lambda)

            linear = paddle.nn.Linear(10, 10)
            adam = fluid.optimizer.Adam(
                scheduler, parameter_list=linear.parameters()
            )

            for epoch in range(30):
                right_result = lambda_decay(epoch, learning_rate, lr_lambda)
                fluid_result = scheduler().numpy()[0]
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.format(
                        epoch, right_result, fluid_result
                    ),
                )

            with self.assertRaises(TypeError):
                lr = fluid.dygraph.LambdaDecay(learning_rate, "test")


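# Static-graph scheduler tests: each fluid.layers decay op is built in a
# Program, run in an Executor, and its per-step output is compared against
# the matching Python reference above on CPU (and GPU when available).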
class TestLearningRateDecay(unittest.TestCase):
    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for place in places:
            self.check_decay_with_place(
                place, python_decay_fn, fluid_decay_fn, kwargs
            )

    def check_decay_with_place(
        self, place, python_decay_fn, fluid_decay_fn, kwargs
    ):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = fluid_decay_fn(**kwargs)

        exe = fluid.Executor(place)

        exe.run(startup_prog)

        for step in range(10):
            # Step of NoamDecay starts from 1.
            if python_decay_fn.__name__ == 'noam_decay':
                step += 1
            (lr_val,) = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            python_decayed_lr = python_decay_fn(
                global_step=float(step), **kwargs
            )
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
                msg='Failed lr scheduler is {0}, step {1}, Python result is {2}, Fluid result is {3}'.format(
                    python_decay_fn.__name__,
                    str(step),
                    str(python_decayed_lr),
                    str(lr_val[0]),
                ),
            )

    def test_decay(self):
        common_kwargs_true = {
            "learning_rate": 1.0,
            "decay_steps": 5,
            "decay_rate": 0.5,
            "staircase": True,
        }
        common_kwargs_false = copy.deepcopy(common_kwargs_true)
        common_kwargs_false["staircase"] = False

        decay_fns = [
            (exponential_decay, layers.exponential_decay, common_kwargs_true),
            (exponential_decay, layers.exponential_decay, common_kwargs_false),
            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_true),
            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
            (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
            (
                inverse_time_decay,
                layers.inverse_time_decay,
                common_kwargs_false,
            ),
            (
                polynomial_decay,
                layers.polynomial_decay,
                {"learning_rate": 1.0, "decay_steps": 5, "cycle": True},
            ),
            (
                polynomial_decay,
                layers.polynomial_decay,
                {"learning_rate": 1.0, "decay_steps": 5, "cycle": False},
            ),
            (
                piecewise_decay,
                layers.piecewise_decay,
                {"boundaries": [3, 6, 9], "values": [0.1, 0.2, 0.3, 0.4]},
            ),
            (
                cosine_decay,
                layers.cosine_decay,
                {"learning_rate": 0.1, "step_each_epoch": 100, "epochs": 120},
            ),
            (
                noam_decay,
                layers.noam_decay,
                {"d_model": 0.01, "warmup_steps": 200, "learning_rate": 2.0},
            ),
        ]

        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
            print(
                "class="
                + self.__class__.__name__
                + " decay_fn="
                + py_decay_fn.__name__
                + " kwargs="
                + str(kwargs)
            )
            main_program = framework.Program()
            startup_program = framework.Program()
            with framework.program_guard(main_program, startup_program):
                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)


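# Warmup variant of the static-graph tests: each decay op is wrapped in
# linear_lr_warmup, so the rate ramps from start_lr to end_lr for the first
# warmup_steps steps and then follows the wrapped decay schedule.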
class TestLinearWarmupLearningRateDecay(unittest.TestCase):
    def check_decay_with_place(
        self, place, python_decay_fn, fluid_decay_fn, kwargs
    ):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        warmup_steps = 10
        start_lr = 0.1 / 3.0
        end_lr = 0.1

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = layers.linear_lr_warmup(
                fluid_decay_fn(**kwargs), warmup_steps, start_lr, end_lr
            )

        exe = fluid.Executor(place)
        exe.run(startup_prog)

        for step in range(20):
            # Step of NoamDecay starts from 1.
            if fluid_decay_fn.__name__ == 'noam_decay':
                step += 1
            (lr_val,) = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                python_decayed_lr = linear_lr_warmup(
                    float(step), warmup_steps, start_lr, end_lr
                )
            else:
                python_decayed_lr = python_decay_fn(
                    global_step=float(step), **kwargs
                )
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
                msg='Test {0} Failed, step {1}, Python result is {2}, Fluid result is {3}'.format(
                    python_decay_fn.__name__,
                    str(step),
                    str(python_decayed_lr),
                    str(lr_val[0]),
                ),
            )


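# linear_lr_warmup applied to a plain Python scalar learning rate instead of
# a decay op: after the warmup phase the rate should stay at the given value.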
class TestLinearWarmupLearningRateDecayWithScalarInput(unittest.TestCase):
    def run_scalar_lr(self, place, lr, start_lr, end_lr):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        warmup_steps = 10

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = layers.linear_lr_warmup(
                lr, warmup_steps, start_lr, end_lr
            )

        exe = fluid.Executor(place)
        exe.run(startup_prog)

        for step in range(20):
            (lr_val,) = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                expected_lr = linear_lr_warmup(
                    float(step), warmup_steps, start_lr, end_lr
                )
            else:
                expected_lr = lr
            self.assertAlmostEqual(
                expected_lr,
                lr_val[0],
                msg='Test failed, step {0}, expected {1}, but got {2}'.format(
                    step, expected_lr, lr_val[0]
                ),
            )

    def test_scalar_lr(self):
        def run_places(lr, start_lr, end_lr):
            places = [fluid.CPUPlace()]
            if core.is_compiled_with_cuda():
                places.append(fluid.CUDAPlace(0))
            for p in places:
                self.run_scalar_lr(p, lr, start_lr, end_lr)

        # float
        lr = 0.2
        start_lr = 0.1 / 3.0
        end_lr = 0.2
        run_places(lr, start_lr, end_lr)

        # int end_lr
        lr = 2.0
        start_lr = 0.1 / 3.0
        end_lr = 1
        run_places(lr, start_lr, end_lr)

        # int
        lr = 1
        start_lr = 0
        end_lr = 1
        run_places(lr, start_lr, end_lr)


if __name__ == '__main__':
    unittest.main()