# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import numpy as np
import unittest

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core


def reduce_lr_on_plateau(
    decay_rate, threshold, cooldown, patience, m, n, loss, var_list
):
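    # Pure-Python reference for ReduceOnPlateau. ``var_list`` carries the
    # scheduler state across calls:
    #   [best_metric, current_lr, cooldown_counter, num_bad_epochs]
    # While cooling down the lr is left unchanged; otherwise it is scaled by
    # decay_rate once the number of bad epochs exceeds ``patience``.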
    def is_better(current, best, m, n):
        if m == 'min' and n == 'rel':
            return current < best - best * threshold
        elif m == 'min' and n == 'abs':
            return current < best - threshold
        elif m == 'max' and n == 'rel':
            return current > best + best * threshold
        else:  # m == 'max' and n == 'abs'
            return current > best + threshold

    if var_list[2] > 0:
        var_list[2] -= 1
        return var_list[1]

    if is_better(loss, var_list[0], m, n):
        var_list[0] = loss
        var_list[3] = 0
    else:
        var_list[3] += 1
        if var_list[3] > patience:
            var_list[2] = cooldown
            var_list[3] = 0
            new_lr = var_list[1] * decay_rate
            var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1]

    return var_list[1]


class TestReduceOnPlateauDecay(unittest.TestCase):
    def test_ReduceLR(self):
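        # Validate constructor arguments first, then compare the scheduler with
        # the pure-Python reference in both static and dygraph mode for every
        # (mode, threshold_mode) combination.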
        # the factor must be less than 1.0
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=2.0)
        # the mode must be "min" or "max"
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test")
        # the threshold_mode must be "rel" or "abs"
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.ReduceOnPlateau(
                learning_rate=1.0, threshold_mode="test"
            )
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test")
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5).step("test")

        places = [paddle.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))

        for place in places:
            for m, n in zip(
                ['min', 'max', 'min', 'max'], ['rel', 'rel', 'abs', 'abs']
            ):
                kwargs = {
                    'learning_rate': 1.0,
                    'mode': m,
                    'factor': 0.5,
                    'patience': 3,
                    'threshold': 1e-4,
                    'threshold_mode': n,
                    'cooldown': 1,
                    'min_lr': 0,
                    'epsilon': 1e-8,
                    'verbose': False,
                }
                paddle.enable_static()
                self._test_static(place, kwargs)
                paddle.disable_static(place)
                self._test_dygraph(place, kwargs)
                paddle.enable_static()

    def _test_static(self, place, kwargs):
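        # Static-graph check: the loss is sin(x) for a steadily incremented x;
        # after each epoch the scheduler's lr must match the pure-Python
        # reference above.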
        paddle.enable_static()

        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
        current_lr = 1.0
        cooldown_counter = 0
        num_bad_epochs = 0
        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]

        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, start_prog):
            x = fluid.layers.create_global_var(
                [1], 1, 'float32', persistable=True
            )
            paddle.increment(x)
            loss = paddle.sin(x)
            scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
            adam = paddle.optimizer.Adam(learning_rate=scheduler)
            adam.minimize(loss)
            lr_var = adam._global_learning_rate()
            test_prog = main_prog.clone()

        exe = paddle.static.Executor(place)
        exe.run(start_prog)

        for epoch in range(20):
            for batch_id in range(1):
                out, actual_lr = exe.run(
                    main_prog, fetch_list=[loss.name, lr_var.name]
                )
                expected_lr = reduce_lr_on_plateau(
                    kwargs['factor'],
                    kwargs['threshold'],
                    kwargs['cooldown'],
                    kwargs['patience'],
                    kwargs['mode'],
                    kwargs['threshold_mode'],
                    out[0],
                    var_list,
                )

            scheduler.step(out[0])
            actual_lr = scheduler()
            self.assertEqual(actual_lr, np.array(expected_lr))

        for epoch in range(10):
            for batch_id in range(1):
                out, actual_lr = exe.run(
                    test_prog, fetch_list=[loss.name, lr_var.name]
                )
                expected_lr = reduce_lr_on_plateau(
                    kwargs['factor'],
                    kwargs['threshold'],
                    kwargs['cooldown'],
                    kwargs['patience'],
                    kwargs['mode'],
                    kwargs['threshold_mode'],
                    out[0],
                    var_list,
                )
            scheduler.step(out[0])
            actual_lr = scheduler()
            self.assertEqual(actual_lr, np.array(expected_lr))

    def _test_dygraph(self, place, kwargs):
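        # Dygraph check: train a small Linear layer, step the scheduler on the
        # observed loss, then verify that the scheduler state survives an
        # optimizer state_dict round trip.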
        paddle.disable_static(place)

        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
        current_lr = 1.0
        cooldown_counter = 0
        num_bad_epochs = 0
        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]

        linear = paddle.nn.Linear(10, 10)
        scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
        adam = paddle.optimizer.Adam(
            learning_rate=scheduler, parameters=linear.parameters()
        )

        for epoch in range(20):
            for batch_id in range(1):
                x = paddle.to_tensor(epoch).astype('float32')
                loss = paddle.sin(x)
                loss.backward()
                adam.step()
                adam.clear_grad()

            scheduler.step(loss)
            # get lr from paddle
            current_lr = adam.get_lr()
            # get lr from python
            expected_lr = reduce_lr_on_plateau(
                kwargs['factor'],
                kwargs['threshold'],
                kwargs['cooldown'],
                kwargs['patience'],
                kwargs['mode'],
                kwargs['threshold_mode'],
                loss,
                var_list,
            )
            self.assertEqual(current_lr, expected_lr)
        state_dict = adam.state_dict()
        scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
        adam1 = paddle.optimizer.Adam(
            learning_rate=scheduler1, parameters=linear.parameters()
        )
        adam1.set_state_dict(state_dict)
        self.assertEqual(
            scheduler.cooldown_counter, scheduler1.cooldown_counter
        )
        self.assertEqual(scheduler.best.numpy()[0], scheduler1.best)
        self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs)
        self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
        self.assertEqual(scheduler.last_lr, scheduler1.last_lr)


def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False):
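    # Noam decay: lr * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5);
    # the step**-0.5 factor is pinned to 1 at step 0, where 0**-0.5 is undefined.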
    if epoch_num == 0:
        a = 1
    else:
        a = math.pow(epoch_num, -0.5)
    b = math.pow(warmup_steps, -1.5) * epoch_num
    return learning_rate * math.pow(d_model, -0.5) * min(a, b)


def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False):
    return learning_rate * lr_lambda(epoch_num)


def multiplicative_lr(epoch_num, learning_rate, lr_lambda, verbose=False):
    latest_lr = learning_rate
    for i in range(epoch_num):
        latest_lr = latest_lr * lr_lambda(i + 1)
    return latest_lr


def piecewise_lr(epoch_num, boundaries, values, verbose=False):
    assert len(boundaries) + 1 == len(values)
    for i in range(len(boundaries)):
        if epoch_num < boundaries[i]:
            return values[i]
    return values[len(values) - 1]


def exponential_lr(epoch_num, learning_rate, gamma, verbose=False):
    return learning_rate * gamma**epoch_num


def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False):
    return learning_rate * math.exp(-1 * gamma * epoch_num)


def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False):
    return learning_rate / (1 + gamma * epoch_num)


def polynomial_lr(
    epoch_num,
    learning_rate,
    decay_steps,
    end_lr=0.0001,
    power=1.0,
    cycle=False,
    verbose=False,
):
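    # When cycle=True the decay window is stretched to the next multiple of
    # decay_steps that covers epoch_num; otherwise epoch_num is clamped so the
    # lr settles at end_lr.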

    if cycle:
        div = math.ceil(epoch_num / float(decay_steps))
        if epoch_num == 0:
            div = 1
        decay_steps = decay_steps * div
    else:
        epoch_num = min(epoch_num, decay_steps)
    return (learning_rate - end_lr) * (
        (1 - float(epoch_num) / float(decay_steps)) ** power
    ) + end_lr


# NOTE: appears to mirror CosineAnnealingDecay.get_lr and is kept at module
# level as a reference only; the tests below use cosine_annealing_lr instead.
def get_lr(self):
    if self.last_epoch == 0:
        return self.base_lr
    elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
        return (
            self.last_lr
            + (self.base_lr - self.eta_min)
            * (1 - math.cos(math.pi / self.T_max))
            / 2
        )

    return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
        1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)
    ) * (self.last_lr - self.eta_min) + self.eta_min


cosine_annealing_lr_current = None


def cosine_annealing_lr(
    epoch_num, learning_rate, T_max, eta_min=0, verbose=False
):
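    # Chained form of cosine annealing: the module-level global keeps the lr
    # returned for the previous epoch, so each call advances the recurrence by
    # one step (matching the get_lr reference above).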
    global cosine_annealing_lr_current
    if epoch_num == 0:
        cosine_annealing_lr_current = learning_rate
    elif (epoch_num - 1 - T_max) % (2 * T_max) == 0:
        cosine_annealing_lr_current = (
            cosine_annealing_lr_current
            + (learning_rate - eta_min)
            * (1 - math.cos(math.pi / float(T_max)))
            / 2
        )
    else:
        cosine_annealing_lr_current = (
            1 + math.cos(math.pi * epoch_num / float(T_max))
        ) / (1 + math.cos(math.pi * (epoch_num - 1) / float(T_max))) * (
            cosine_annealing_lr_current - eta_min
        ) + eta_min
    return cosine_annealing_lr_current


def linear_warmup_lr(
    epoch_num, learning_rate, warmup_steps, start_lr, end_lr, verbose=False
):
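    # Linear interpolation from start_lr to end_lr during warmup. After warmup
    # the expected lr follows the wrapped schedule: the dygraph test wraps
    # PiecewiseDecay([3, 6], [0.5, 0.2, 0.1]), modelled by the hard-coded
    # branches below, while the static test uses a constant 0.5.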
    tmp = epoch_num - warmup_steps
    if tmp < 0:
        return start_lr + (end_lr - start_lr) * (
            float(epoch_num) / float(warmup_steps)
        )
    elif paddle.in_dynamic_mode():
        if tmp < 3:
            return 0.5
        elif tmp < 6:
            return 0.2
        else:
            return 0.1
    else:
        return 0.5


def multi_step_lr(
    epoch_num, learning_rate, milestones, gamma=0.1, verbose=False
):
    for i in range(len(milestones)):
        if epoch_num < milestones[i]:
            return learning_rate * (gamma**i)
    return learning_rate * (gamma ** len(milestones))


def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False):
    return learning_rate * math.pow(gamma, epoch_num // step_size)


def one_cycle_lr(
    epoch_num,
    max_learning_rate,
    total_steps,
    divide_factor=25,
    end_learning_rate=0.0001,
    phase_pct=0.3,
    anneal_strategy='cos',
    three_phase=False,
    verbose=False,
):
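    # Reference OneCycleLR: ramp from max_learning_rate / divide_factor up to
    # max_learning_rate, then anneal down to end_learning_rate with a cosine or
    # linear curve; three_phase inserts a symmetric ramp back to the initial lr
    # before the final decay.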
    initial_lr = max_learning_rate / divide_factor
    if three_phase:
        _end_steps = [
            float(phase_pct * total_steps) - 1,
            float(2 * phase_pct * total_steps) - 2,
            total_steps - 1,
        ]
        _schedule_phases = [
            {
                'start_lr': initial_lr,
                'end_lr': max_learning_rate,
            },
            {
                'start_lr': max_learning_rate,
                'end_lr': initial_lr,
            },
            {
                'start_lr': initial_lr,
                'end_lr': end_learning_rate,
            },
        ]
    else:
        _end_steps = [float(phase_pct * total_steps) - 1, total_steps - 1]
        _schedule_phases = [
            {
                'start_lr': initial_lr,
                'end_lr': max_learning_rate,
            },
            {
                'start_lr': max_learning_rate,
                'end_lr': end_learning_rate,
            },
        ]

    if anneal_strategy == 'cos':

        def anneal_func(start, end, pct):
            cos_out = math.cos(math.pi * pct) + 1
            return end + (start - end) / 2.0 * cos_out

    else:

        def anneal_func(start, end, pct):
            return (end - start) * pct + start

    start_step = 0
    for i, phase in enumerate(_schedule_phases):
        end_step = _end_steps[i]
        if epoch_num <= end_step or i == len(_schedule_phases) - 1:
            pct = (epoch_num - start_step) / (end_step - start_step)
            computed_lr = anneal_func(phase['start_lr'], phase['end_lr'], pct)
            break
        start_step = end_step

    return computed_lr


def cyclic_lr(
    epoch_num,
    base_learning_rate,
    max_learning_rate,
    step_size_up,
    step_size_down,
    mode,
    exp_gamma=0.1,
    scale_fn=None,
    scale_mode='cycle',
    verbose=False,
):
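    # Reference CyclicLR: the lr bounces between base_learning_rate and
    # max_learning_rate over a cycle of step_size_up + step_size_down steps;
    # scale_fn damps the amplitude and is evaluated either per cycle or per
    # iteration, which is why eval(scale_mode) picks a local variable below.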
    total_steps = step_size_up + step_size_down
    step_ratio = step_size_up / total_steps

    def triangular(x):
        return 1.0

    def triangular2(x):
        return 1 / (2.0 ** (x - 1))

    def exp_range(x):
        return exp_gamma**x

    if scale_fn is None:
        if mode == 'triangular':
            scale_fn = triangular
            scale_mode = 'cycle'
        elif mode == 'triangular2':
            scale_fn = triangular2
            scale_mode = 'cycle'
        elif mode == 'exp_range':
            scale_fn = exp_range
            scale_mode = 'iterations'

    cycle = math.floor(1 + epoch_num / total_steps)
    iterations = epoch_num
    x = 1.0 + epoch_num / total_steps - cycle

    if x <= step_ratio:
        scale_factor = x / step_ratio
    else:
        scale_factor = (x - 1) / (step_ratio - 1)

    base_height = (max_learning_rate - base_learning_rate) * scale_factor

    return base_learning_rate + base_height * scale_fn(eval(scale_mode))


class TestLRScheduler(unittest.TestCase):
    def _test_static(self, python_func, paddle_api, kwarg, place):
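        # Run the scheduler in a static-graph program (train program, cloned
        # test program and, on CPU, a data-parallel CompiledProgram) and compare
        # the fetched lr with the pure-Python reference each epoch.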
        scheduler = paddle_api(**kwarg)
        adam = paddle.optimizer.Adam(learning_rate=scheduler)

        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, start_prog):
            x = paddle.static.data(name='x', shape=[3, 4, 5])
            loss = paddle.mean(x)

            adam.minimize(loss)
            lr_var = adam._global_learning_rate()
            test_prog = main_prog.clone()

        num = 0
        exe = paddle.static.Executor(place)
        exe.run(start_prog)

        for epoch in range(5):
            for batch_id in range(2):
                out = exe.run(
                    main_prog,
                    feed={'x': np.random.randn(3, 4, 5).astype('float32')},
                    fetch_list=lr_var.name,
                )
            self.assertEqual(out, np.array(python_func(num, **kwarg)))
            scheduler.step()
            num += 1

        for epoch in range(5):
            for batch_id in range(2):
                out = exe.run(
                    test_prog,
                    feed={'x': np.random.randn(3, 4, 5).astype('float32')},
                    fetch_list=lr_var.name,
                )
            self.assertEqual(out, np.array(python_func(num, **kwarg)))
            scheduler.step()
            num += 1

        if isinstance(place, paddle.CPUPlace):
            compiled_train_prog = paddle.static.CompiledProgram(
                main_prog
            ).with_data_parallel(
                loss_name=loss.name, places=fluid.cpu_places(4)
            )
            for epoch in range(5):
                python_result = python_func(num, **kwarg)
                for batch_id in range(2):
                    _ = exe.run(
                        compiled_train_prog,
                        feed={'x': np.random.randn(12, 4, 5).astype('float32')},
                        fetch_list=lr_var.name,
                    )
                scopes = compiled_train_prog._executor.local_scopes()
                out = np.array(scopes[0].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                out = np.array(scopes[1].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                out = np.array(scopes[2].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                out = np.array(scopes[3].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                scheduler.step()
                num += 1

            compiled_test_prog = paddle.static.CompiledProgram(
                test_prog
            ).with_data_parallel(
                loss_name=loss.name,
                share_vars_from=compiled_train_prog,
                places=fluid.cpu_places(4),
            )
            for epoch in range(5):
                python_result = python_func(num, **kwarg)
                for batch_id in range(2):
                    _ = exe.run(
                        compiled_test_prog,
                        feed={'x': np.random.randn(12, 4, 5).astype('float32')},
                        fetch_list=lr_var.name,
                    )
                scopes = compiled_test_prog._executor.local_scopes()
                out = np.array(scopes[0].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                out = np.array(scopes[1].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                out = np.array(scopes[2].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                out = np.array(scopes[3].var(lr_var.name).get_tensor())
                self.assertEqual(out, np.array(python_result))
                scheduler.step()
                num += 1

    def _test_dygraph(self, python_func, paddle_api, kwarg, place):
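        # Dygraph counterpart: compare adam.get_lr() with the pure-Python
        # reference each epoch; LinearWarmup additionally checks the state_dict
        # round trip of the wrapped scheduler.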
        paddle.disable_static(place)
        x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
        linear = paddle.nn.Linear(10, 10)
        if paddle_api.__name__ == "LinearWarmup":
            kwarg['learning_rate'] = paddle.optimizer.lr.PiecewiseDecay(
                [3, 6], [0.5, 0.2, 0.1]
            )
        scheduler = paddle_api(**kwarg)
        adam = paddle.optimizer.Adam(
            learning_rate=scheduler, parameters=linear.parameters()
        )
        for epoch in range(20):
            for batch_id in range(2):
                x = paddle.to_tensor(x)
                out = linear(x)
                loss = paddle.mean(out)
                loss.backward()
                adam.step()
                adam.clear_grad()
            current_lr = adam.get_lr()
            expected_lr = python_func(epoch, **kwarg)
            if paddle_api.__name__ == "CosineAnnealingDecay":
                self.assertAlmostEqual(current_lr, expected_lr)
                scheduler.step(epoch + 1)
            elif paddle_api.__name__ == "LinearWarmup":
                self.assertAlmostEqual(current_lr, expected_lr)
                state_dict = adam.state_dict()
                scheduler1 = paddle.optimizer.lr.LinearWarmup(**kwarg)
                adam1 = paddle.optimizer.Adam(
                    learning_rate=scheduler1, parameters=linear.parameters()
                )
                adam1.set_state_dict(state_dict)
                self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
                self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
                self.assertEqual(
                    scheduler.learning_rate.last_lr,
                    scheduler1.learning_rate.last_lr,
                )
                self.assertEqual(
                    scheduler.learning_rate.last_epoch,
                    scheduler1.learning_rate.last_epoch,
                )
                scheduler.step()
            else:
                self.assertEqual(current_lr, expected_lr)
                scheduler.step()

    def test_scheduler(self):
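        # Argument validation for LRScheduler, MultiStepDecay, OneCycleLR and
        # CyclicLR, followed by a sweep over every (reference function, paddle
        # scheduler, kwargs) combination in both static and dygraph mode.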
        with self.assertRaises(NotImplementedError):
            paddle.optimizer.lr.LRScheduler().step()
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.MultiStepDecay(
                learning_rate="test", milestones=[1, 2, 3]
            )
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.MultiStepDecay(
                learning_rate=0.5, milestones='test'
            )
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.MultiStepDecay(
                learning_rate=0.5, milestones=[3, 2, 1]
            )
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.MultiStepDecay(
                learning_rate=0.5, milestones=[1, 2, 3], gamma=2
            )
        # check type of max_learning_rate
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate='test', total_steps=20
            )
        # check value of max_learning_rate
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=-1.5, total_steps=20
            )
        # check type of end_learning_rate
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=0.1, total_steps=20, end_learning_rate='test'
            )
        # check value of end_learning_rate
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=0.1, total_steps=20, end_learning_rate=-1
            )
        # check type of total_steps
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=0.1, total_steps='test'
            )
        # check value of total_steps
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=0.1, total_steps=-10
            )
        # check value of anneal_strategy
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=0.1, total_steps=20, anneal_strategy='test'
            )
        # check value of phase_pct when three_phase is True
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.OneCycleLR(
                max_learning_rate=0.1,
                total_steps=20,
                phase_pct=0.6,
                three_phase=True,
            )
        # check type of max_learning_rate
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5,
                max_learning_rate='test',
                step_size_up=10,
            )
        # check value of max_learning_rate
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5, max_learning_rate=-1, step_size_up=10
            )
        # check type of step_size_up
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5,
                max_learning_rate=1.0,
                step_size_up='test',
            )
        # check value of step_size_up
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5, max_learning_rate=1.0, step_size_up=-1
            )
        # check type of step_size_down
        with self.assertRaises(TypeError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5,
                max_learning_rate=1.0,
                step_size_up=500,
                step_size_down='test',
            )
        # check value of step_size_down
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5,
                max_learning_rate=1.0,
                step_size_up=500,
                step_size_down=-1,
            )
        # check value of mode
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5,
                max_learning_rate=1.0,
                step_size_up=500,
                step_size_down=500,
                mode='test',
            )
        # check value of scale_mode
        with self.assertRaises(ValueError):
            paddle.optimizer.lr.CyclicLR(
                base_learning_rate=0.5,
                max_learning_rate=1.0,
                step_size_up=500,
                step_size_down=-1,
                scale_mode='test',
            )

        func_api_kwargs = [
            (
                noam_lr,
                paddle.optimizer.lr.NoamDecay,
                {"d_model": 0.01, "warmup_steps": 100, "verbose": False},
            ),
            (
                piecewise_lr,
                paddle.optimizer.lr.PiecewiseDecay,
                {
                    "boundaries": [3, 6, 9, 15, 20],
                    "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                    "verbose": False,
                },
            ),
            (
                natural_exp_lr,
                paddle.optimizer.lr.NaturalExpDecay,
                {"learning_rate": 0.5, "gamma": 0.1, "verbose": True},
            ),
            (
                inverse_time_lr,
                paddle.optimizer.lr.InverseTimeDecay,
                {"learning_rate": 0.5, "gamma": 0.1, "verbose": False},
            ),
            (
                polynomial_lr,
                paddle.optimizer.lr.PolynomialDecay,
                {
                    "learning_rate": 0.5,
                    "decay_steps": 20,
                    "end_lr": 0,
                    "power": 1.0,
                    "cycle": False,
                },
            ),
            (
                polynomial_lr,
                paddle.optimizer.lr.PolynomialDecay,
                {
                    "learning_rate": 0.5,
                    "decay_steps": 20,
                    "end_lr": 0,
                    "power": 1.0,
                    "cycle": True,
                    "verbose": False,
                },
            ),
            (
                linear_warmup_lr,
                paddle.optimizer.lr.LinearWarmup,
                {
                    'learning_rate': 0.5,
                    'warmup_steps': 10,
                    'start_lr': 0,
                    'end_lr': 0.5,
                },
            ),
            (
                exponential_lr,
                paddle.optimizer.lr.ExponentialDecay,
                {"learning_rate": 0.5, "gamma": 0.9, "verbose": False},
            ),
            (
                multi_step_lr,
                paddle.optimizer.lr.MultiStepDecay,
                {
                    "learning_rate": 0.5,
                    "milestones": [3, 6, 9, 15, 20],
                    "gamma": 0.8,
                },
            ),
            (
                step_lr,
                paddle.optimizer.lr.StepDecay,
                {
                    "learning_rate": 0.5,
                    "step_size": 2,
                    "gamma": 0.8,
                    "verbose": False,
                },
            ),
            (
                lambda_lr,
                paddle.optimizer.lr.LambdaDecay,
                {
                    "learning_rate": 0.5,
                    "lr_lambda": lambda x: 0.95**x,
                    "verbose": True,
                },
            ),
            (
                multiplicative_lr,
                paddle.optimizer.lr.MultiplicativeDecay,
                {
                    "learning_rate": 0.5,
                    "lr_lambda": lambda x: 0.95,
                    "verbose": True,
                },
            ),
            (
                cosine_annealing_lr,
                paddle.optimizer.lr.CosineAnnealingDecay,
                {"learning_rate": 0.5, "T_max": 10, "verbose": False},
            ),
            (
                one_cycle_lr,
                paddle.optimizer.lr.OneCycleLR,
                {
                    "max_learning_rate": 0.1,
                    "total_steps": 20,
                    "divide_factor": 5,
                    "end_learning_rate": 0.0001,
                    "anneal_strategy": 'cos',
                    "phase_pct": 0.3,
                    "three_phase": False,
                },
            ),
            (
                one_cycle_lr,
                paddle.optimizer.lr.OneCycleLR,
                {
                    "max_learning_rate": 0.5,
                    "total_steps": 20,
                    "divide_factor": 10,
                    "end_learning_rate": 0.001,
                    "anneal_strategy": 'linear',
                    "phase_pct": 0.4,
                    "three_phase": False,
                },
            ),
            (
                one_cycle_lr,
                paddle.optimizer.lr.OneCycleLR,
                {
                    "max_learning_rate": 1.0,
                    "total_steps": 20,
                    "divide_factor": 9,
                    "end_learning_rate": 0.0001,
                    "anneal_strategy": 'cos',
                    "phase_pct": 0.3,
                    "three_phase": True,
                },
            ),
            (
                one_cycle_lr,
                paddle.optimizer.lr.OneCycleLR,
                {
                    "max_learning_rate": 0.3,
                    "total_steps": 20,
                    "divide_factor": 25,
                    "end_learning_rate": 0.0005,
                    "anneal_strategy": 'linear',
                    "phase_pct": 0.2,
                    "three_phase": True,
                },
            ),
            (
                cyclic_lr,
                paddle.optimizer.lr.CyclicLR,
                {
                    "base_learning_rate": 0.5,
                    "max_learning_rate": 1.0,
                    "step_size_up": 15,
                    "step_size_down": 5,
                    "mode": 'triangular',
                    "exp_gamma": 1.0,
                    "scale_fn": None,
                    "scale_mode": 'cycle',
                    "verbose": False,
                },
            ),
            (
                cyclic_lr,
                paddle.optimizer.lr.CyclicLR,
                {
                    "base_learning_rate": 0.5,
                    "max_learning_rate": 1.0,
                    "step_size_up": 15,
                    "step_size_down": 5,
                    "mode": 'triangular2',
                    "exp_gamma": 1.0,
                    "scale_fn": None,
                    "scale_mode": 'cycle',
                    "verbose": False,
                },
            ),
            (
                cyclic_lr,
                paddle.optimizer.lr.CyclicLR,
                {
                    "base_learning_rate": 0.5,
                    "max_learning_rate": 1.0,
                    "step_size_up": 15,
                    "step_size_down": 5,
                    "mode": 'exp_range',
                    "exp_gamma": 0.8,
                    "scale_fn": None,
                    "scale_mode": 'cycle',
                    "verbose": False,
                },
            ),
            (
                cyclic_lr,
                paddle.optimizer.lr.CyclicLR,
                {
                    "base_learning_rate": 0.5,
                    "max_learning_rate": 1.0,
                    "step_size_up": 15,
                    "step_size_down": 5,
                    "mode": 'exp_range',
                    "exp_gamma": 1.0,
                    "scale_fn": lambda x: 0.95**x,
                    "scale_mode": 'cycle',
                    "verbose": False,
                },
            ),
            (
                cyclic_lr,
                paddle.optimizer.lr.CyclicLR,
                {
                    "base_learning_rate": 0.5,
                    "max_learning_rate": 1.0,
                    "step_size_up": 15,
                    "step_size_down": 5,
                    "mode": 'exp_range',
                    "exp_gamma": 1.0,
                    "scale_fn": lambda x: 0.95,
                    "scale_mode": 'iterations',
                    "verbose": False,
                },
            ),
        ]

        for python_func, paddle_api, kwarg in func_api_kwargs:
            places = [paddle.CPUPlace()]
            if core.is_compiled_with_cuda():
                places.append(paddle.CUDAPlace(0))

            for place in places:
                paddle.enable_static()
                self._test_static(python_func, paddle_api, kwarg, place)
                paddle.disable_static(place)
                self._test_dygraph(python_func, paddle_api, kwarg, place)
                paddle.enable_static()

    def test_linear_warmup(self):
        # Wrap NaturalExpDecay in LinearWarmup; once warmup is over, both
        # schedulers must report the same lr.
        natural_lr = paddle.optimizer.lr.NaturalExpDecay(
            learning_rate=0.5, gamma=0.1
        )
        natural_lr_warmup = paddle.optimizer.lr.LinearWarmup(
            learning_rate=natural_lr, warmup_steps=10, start_lr=0.0, end_lr=0.1
        )
        for idx in range(30):
            if idx >= 10:
                self.assertEqual(
                    natural_lr_warmup.get_lr(), natural_lr.get_lr()
                )
                natural_lr.step()
            natural_lr_warmup.step()


if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()