# test_learning_rate_scheduler.py
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
import copy
16
import math
17
import numpy as np
18
import unittest
19

20
import paddle.fluid as fluid
21
import paddle.fluid.layers as layers
22
import paddle.fluid.framework as framework
Q
QI JUN 已提交
23
import paddle.fluid.core as core
Q
Qiao Longfei 已提交
24 25 26 27 28 29 30


def exponential_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False):
    """Pure-Python reference for exponential learning-rate decay.

    Computes ``learning_rate * decay_rate ** (global_step / decay_steps)``.
    When ``staircase`` is true the exponent is floored first, so the rate
    only changes every ``decay_steps`` steps.

    Args:
        learning_rate: Initial learning rate.
        global_step: Current step count.
        decay_steps: Number of steps per decay period.
        decay_rate: Multiplicative factor applied per decay period.
        staircase: Floor the exponent when true.

    Returns:
        The decayed learning rate.
    """
    # Coerce to float (as the sibling reference functions do) so integer
    # arguments never trigger floor division on interpreters where ``/`` on
    # ints truncates.
    exponent = float(global_step) / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * decay_rate**exponent


def natural_exp_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False):
    """Pure-Python reference for natural-exponential learning-rate decay.

    Returns ``learning_rate * exp(-decay_rate * ratio)`` where ``ratio`` is
    ``global_step / decay_steps`` in floating point, floored first when
    ``staircase`` is true.
    """
    ratio = float(global_step) / float(decay_steps)
    ratio = math.floor(ratio) if staircase else ratio
    scale = math.exp(-1 * decay_rate * ratio)
    return learning_rate * scale


def inverse_time_decay(learning_rate,
                       global_step,
                       decay_steps,
                       decay_rate,
                       staircase=False):
    """Pure-Python reference for inverse-time learning-rate decay.

    Returns ``learning_rate / (1 + decay_rate * ratio)`` where ``ratio`` is
    ``global_step / decay_steps`` in floating point, floored first when
    ``staircase`` is true.
    """
    ratio = float(global_step) / float(decay_steps)
    if staircase:
        ratio = math.floor(ratio)
    denominator = 1 + decay_rate * ratio
    return learning_rate / denominator


59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
def polynomial_decay(learning_rate,
                     global_step,
                     decay_steps,
                     end_learning_rate=0.0001,
                     power=1.0,
                     cycle=False):
    """Pure-Python reference for polynomial learning-rate decay.

    Interpolates from ``learning_rate`` down to ``end_learning_rate`` using
    ``(1 - global_step / decay_steps) ** power``. Without ``cycle`` the step
    is clamped to ``decay_steps``; with ``cycle`` the decay horizon is
    stretched to the next multiple of ``decay_steps`` at or above the step.
    """
    if not cycle:
        global_step = min(global_step, decay_steps)
    else:
        # A ceil of zero is replaced by one so the horizon never collapses
        # to zero steps.
        periods = math.ceil(global_step / float(decay_steps)) or 1
        decay_steps = decay_steps * periods

    remaining = 1 - float(global_step) / float(decay_steps)
    lr_span = learning_rate - end_learning_rate
    return lr_span * (remaining ** power) + end_learning_rate


def piecewise_decay(global_step, boundaries, values):
    """Pure-Python reference for piecewise-constant learning rates.

    Returns the value paired with the first boundary that ``global_step`` is
    strictly below, or the final value when the step is at or past every
    boundary. ``values`` must hold exactly one more entry than ``boundaries``.
    """
    assert len(boundaries) + 1 == len(values)
    for boundary, value in zip(boundaries, values):
        if global_step < boundary:
            return value
    return values[-1]
Q
Qiao Longfei 已提交
82

83

S
shippingwang 已提交
84 85
def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
    """Pure-Python reference for cosine learning-rate decay.

    The current epoch is ``floor(global_step / step_each_epoch)`` and the
    result is ``learning_rate * 0.5 * (cos(epoch * pi / epochs) + 1)``.

    Args:
        global_step: Current step count.
        learning_rate: Initial learning rate.
        step_each_epoch: Number of steps in one epoch.
        epochs: Epoch count used as the cosine period denominator.

    Returns:
        The decayed learning rate.
    """
    cur_epoch = math.floor(global_step / step_each_epoch)
    decayed_lr = learning_rate * 0.5 * (
        math.cos(cur_epoch * math.pi / epochs) + 1)
    return decayed_lr


91 92 93 94 95 96 97 98
def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
    """Pure-Python reference for Noam learning-rate decay.

    Returns ``learning_rate * d_model**-0.5 *
    min(global_step**-0.5, warmup_steps**-1.5 * global_step)``.
    """
    inv_sqrt_step = math.pow(global_step, -0.5)
    warmup_ramp = math.pow(warmup_steps, -1.5) * global_step
    model_scale = math.pow(d_model, -0.5)
    return learning_rate * model_scale * min(inv_sqrt_step, warmup_ramp)


99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
    """Pure-Python reference for linear warmup.

    Moves linearly from ``start_lr`` towards ``end_lr`` in proportion to
    ``global_step / warmup_steps``.
    """
    lr_span = end_lr - start_lr
    progress = global_step / warmup_steps
    return start_lr + lr_span * progress


def multi_step_decay(global_step, learning_rate, milestones, decay_rate=0.1):
    """Pure-Python reference for multi-step learning-rate decay.

    The exponent on ``decay_rate`` is the index of the first milestone that
    ``global_step`` is strictly below, or ``len(milestones)`` when the step
    is at or past all of them.
    """
    exponent = next(
        (idx for idx, milestone in enumerate(milestones)
         if global_step < milestone),
        len(milestones))
    return learning_rate * math.pow(decay_rate, exponent)


def step_decay(global_step, learning_rate, step_size, decay_rate=0.1):
    """Pure-Python reference for step decay.

    Multiplies ``learning_rate`` by ``decay_rate`` once for every complete
    ``step_size`` interval contained in ``global_step``.
    """
    completed_intervals = global_step // step_size
    factor = math.pow(decay_rate, completed_intervals)
    return learning_rate * factor


117 118 119 120
def lambda_decay(global_step, learning_rate, lr_lambda):
    """Pure-Python reference for lambda decay.

    Scales ``learning_rate`` by whatever ``lr_lambda(global_step)`` returns.
    """
    factor = lr_lambda(global_step)
    return learning_rate * factor


121
class TestLearningRateDecayDygraph(unittest.TestCase):
    """Dygraph-mode checks for the fluid learning-rate schedulers.

    Each test runs inside ``fluid.dygraph.guard()`` and compares scheduler
    output with the pure-Python reference functions defined in this module,
    or checks that invalid constructor arguments raise.
    """

    def test_LR_state_dict(self):
        # Scheduler state stored in an optimizer's state_dict must survive a
        # save_dygraph / load_dygraph / set_dict round trip.
        with fluid.dygraph.guard():
            x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
            linear = fluid.dygraph.Linear(10, 10)
            inputs = fluid.dygraph.to_variable(x)

            Exponential_scheduler = fluid.dygraph.ExponentialDecay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True)
            Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
            Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)

            adam1 = fluid.optimizer.Adam(learning_rate=Exponential_scheduler,
                                         parameter_list=linear.parameters())
            adam2 = fluid.optimizer.Adam(learning_rate=Step_scheduler,
                                         parameter_list=linear.parameters())
            adam3 = fluid.optimizer.Adam(learning_rate=Reducelr_scheduler,
                                         parameter_list=linear.parameters())
            print(adam3.state_dict())

            # Advance all three optimizers/schedulers so their state differs
            # from that of a freshly constructed scheduler.
            for epoch in range(10):
                out = linear(inputs)
                loss = fluid.layers.reduce_mean(out)
                loss.backward()
                adam1.minimize(loss)
                adam2.minimize(loss)
                adam3.minimize(loss)
                linear.clear_gradients()

                Step_scheduler.epoch()
                Reducelr_scheduler.step(loss)

            fluid.dygraph.save_dygraph(linear.state_dict(), "save_path")

            # Fresh schedulers with the same hyper-parameters; their state is
            # expected to be overwritten by set_dict below.
            Exponential_scheduler_test = fluid.dygraph.ExponentialDecay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True)
            Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3)
            Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau(
                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)

            # ExponentialDecay round trip.
            fluid.dygraph.save_dygraph(adam1.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Exponential_scheduler_test,
                parameter_list=linear.parameters())
            adam_test.set_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.step_num,
                adam1._learning_rate.step_num,
                "step_num is different before and after set_dict")

            # StepDecay round trip.
            fluid.dygraph.save_dygraph(adam2.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(learning_rate=Step_scheduler_test,
                                             parameter_list=linear.parameters())
            adam_test.set_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.epoch_num,
                adam2._learning_rate.epoch_num,
                "epoch_num is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate(), adam2._learning_rate(),
                "current learning rate is different before and after set_dict")

            # ReduceLROnPlateau round trip.
            fluid.dygraph.save_dygraph(adam3.state_dict(), "save_path")
            _, opt_state = fluid.dygraph.load_dygraph("save_path")
            adam_test = fluid.optimizer.Adam(
                learning_rate=Reducelr_scheduler_test,
                parameter_list=linear.parameters())
            adam_test.set_dict(opt_state)
            self.assertEqual(
                adam_test._learning_rate.best_loss,
                adam3._learning_rate.best_loss.numpy()[0],
                "best_loss is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate.cooldown_counter,
                adam3._learning_rate.cooldown_counter,
                "cooldown_counter is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate.num_bad_epochs,
                adam3._learning_rate.num_bad_epochs,
                "num_bad_epochs is different before and after set_dict")
            self.assertEqual(adam_test._learning_rate.epoch_num,
                             adam3._learning_rate.epoch_num,
                             "epoch is different before and after set_dict")
            self.assertEqual(
                adam_test._learning_rate(), adam3._learning_rate(),
                "current learning rate is different before and after set_dict")

    def test_NoamDecay(self):
        # Compare fluid.layers.noam_decay with the Python reference for the
        # first five steps; the reference is evaluated at steps 1..5.
        with fluid.dygraph.guard():
            d_model = 0.01
            warmup_steps = 200
            learning_rate = 2.0
            lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate)
            for step in range(1, 6):
                right_result = noam_decay(step, d_model, warmup_steps,
                                          learning_rate)
                fluid_result = lr()

                self.assertAlmostEqual(
                    right_result,
                    fluid_result[0],
                    msg=
                    'Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'
                    .format(step, right_result, fluid_result[0]))

    def test_LinearLrWarmup(self):
        # Warmup wrapped around a polynomial decay: check the first five
        # values, then check that a non-numeric learning rate is rejected.
        with fluid.dygraph.guard():
            lr = fluid.layers.polynomial_decay(learning_rate=1.0,
                                               decay_steps=10,
                                               end_learning_rate=0.0,
                                               power=1.0)
            lr = fluid.layers.linear_lr_warmup(learning_rate=lr,
                                               warmup_steps=2,
                                               start_lr=0.0,
                                               end_lr=1.0)

            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
            for expected in right_result:
                t = lr()
                np.testing.assert_allclose(t.numpy()[0].item(),
                                           expected,
                                           rtol=1e-05)

            with self.assertRaises(TypeError):
                fluid.layers.linear_lr_warmup(learning_rate="fake_lr",
                                              warmup_steps=2,
                                              start_lr=0.0,
                                              end_lr=1.0)

    def test_MultiStepDecay(self):
        # Compare MultiStepDecay (read through the optimizer) with the Python
        # reference for ten epochs, then check argument validation.
        with fluid.dygraph.guard():
            learning_rate = 0.5
            milestones = [2, 4, 8]
            decay_rate = 0.2
            linear = fluid.dygraph.Linear(10, 10)

            scheduler = fluid.dygraph.MultiStepDecay(learning_rate, milestones,
                                                     decay_rate)

            adam = fluid.optimizer.AdamOptimizer(
                learning_rate=scheduler, parameter_list=linear.parameters())
            for epoch in range(10):
                right_result = multi_step_decay(epoch, learning_rate,
                                                milestones, decay_rate)
                fluid_result = adam.current_step_lr()
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg=
                    'Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'
                    .format(epoch, right_result, fluid_result))

            with self.assertRaises(ValueError):
                fluid.dygraph.MultiStepDecay(learning_rate, [30, 50, 20], 0.1)

            with self.assertRaises(ValueError):
                fluid.dygraph.MultiStepDecay(learning_rate, [20, 30, 50], 1)

            with self.assertRaises(TypeError):
                fluid.dygraph.MultiStepDecay("test", [20, 30, 50])

            with self.assertRaises(ValueError):
                fluid.dygraph.MultiStepDecay(-1, [20, 30, 50])

    def test_StepDecay(self):
        # Compare StepDecay with the Python reference for ten epochs, then
        # check argument validation.
        with fluid.dygraph.guard():
            learning_rate = 0.5
            step_size = 3
            decay_rate = 0.2
            scheduler = fluid.dygraph.StepDecay(learning_rate, step_size,
                                                decay_rate)
            for epoch in range(10):
                right_result = step_decay(epoch, learning_rate, step_size,
                                          decay_rate)
                fluid_result = scheduler().numpy()[0]
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg=
                    'Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'
                    .format(epoch, right_result, fluid_result))

            with self.assertRaises(TypeError):
                fluid.dygraph.StepDecay(learning_rate, "test", 0.1)

            with self.assertRaises(ValueError):
                fluid.dygraph.StepDecay(learning_rate, 20, 2)

    def test_LambdaDecay(self):
        # Compare LambdaDecay with the Python reference for thirty epochs,
        # then check that a non-callable lr_lambda is rejected.
        with fluid.dygraph.guard():
            learning_rate = 0.5
            lr_lambda = lambda x: 0.95**x
            scheduler = fluid.dygraph.LambdaDecay(learning_rate, lr_lambda)

            linear = fluid.dygraph.nn.Linear(10, 10)
            # The optimizer is constructed with the scheduler but not stepped;
            # the scheduler is queried directly below.
            adam = fluid.optimizer.Adam(scheduler,
                                        parameter_list=linear.parameters())

            for epoch in range(30):
                right_result = lambda_decay(epoch, learning_rate, lr_lambda)
                fluid_result = scheduler().numpy()[0]
                scheduler.epoch()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
                    msg=
                    'Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'
                    .format(epoch, right_result, fluid_result))

            with self.assertRaises(TypeError):
                fluid.dygraph.LambdaDecay(learning_rate, "test")

350

351
class TestLearningRateDecay(unittest.TestCase):
    """Static-graph checks of ``fluid.layers`` decay functions.

    Each fluid decay function is built into a program, run for ten steps and
    compared step by step with its pure-Python reference from this module.
    """

    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
        # Run the comparison on CPU and, when Paddle was compiled with CUDA,
        # on GPU 0 as well.
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for place in places:
            self.check_decay_with_place(place, python_decay_fn, fluid_decay_fn,
                                        kwargs)

    def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                               kwargs):
        # Build the fluid decay op in its own program pair, execute it on
        # ``place`` and compare ten consecutive values with the reference.
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = fluid_decay_fn(**kwargs)

        # Use the place handed in by the caller. Previously this was
        # overwritten with CPUPlace(), so the CUDA place from check_decay
        # was silently ignored.
        exe = fluid.Executor(place)

        exe.run(startup_prog)

        for step in range(10):
            # Step of NoamDecay starts from 1.
            if python_decay_fn.__name__ == 'noam_decay':
                step += 1
            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            python_decayed_lr = python_decay_fn(global_step=float(step),
                                                **kwargs)
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
                msg=
                'Failed lr scheduler is {0}, step {1}, Python result is {2}, Fluid result is {3}'
                .format(python_decay_fn.__name__, str(step),
                        str(python_decayed_lr), str(lr_val[0])))

    def test_decay(self):
        # Table-driven: (python reference, fluid layer, kwargs) triples.
        common_kwargs_true = {
            "learning_rate": 1.0,
            "decay_steps": 5,
            "decay_rate": 0.5,
            "staircase": True
        }
        common_kwargs_false = copy.deepcopy(common_kwargs_true)
        common_kwargs_false["staircase"] = False

        decay_fns = [
            (exponential_decay, layers.exponential_decay, common_kwargs_true),
            (exponential_decay, layers.exponential_decay, common_kwargs_false),
            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_true),
            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
            (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
            (inverse_time_decay, layers.inverse_time_decay,
             common_kwargs_false),
            (polynomial_decay, layers.polynomial_decay, {
                "learning_rate": 1.0,
                "decay_steps": 5,
                "cycle": True
            }),
            (polynomial_decay, layers.polynomial_decay, {
                "learning_rate": 1.0,
                "decay_steps": 5,
                "cycle": False
            }),
            (piecewise_decay, layers.piecewise_decay, {
                "boundaries": [3, 6, 9],
                "values": [0.1, 0.2, 0.3, 0.4]
            }),
            (cosine_decay, layers.cosine_decay, {
                "learning_rate": 0.1,
                "step_each_epoch": 100,
                "epochs": 120
            }),
            (noam_decay, layers.noam_decay, {
                "d_model": 0.01,
                "warmup_steps": 200,
                "learning_rate": 2.0
            })
        ]

        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
            print("class=" + self.__class__.__name__ + " decay_fn=" +
                  py_decay_fn.__name__ + " kwargs=" + str(kwargs))
            main_program = framework.Program()
            startup_program = framework.Program()
            with framework.program_guard(main_program, startup_program):
                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
Q
Qiao Longfei 已提交
440 441


442
class TestLinearWamrupLearningRateDecay(unittest.TestCase):
    """Static-graph check of ``layers.linear_lr_warmup`` wrapped around a decay.

    NOTE(review): the class body visible here defines no ``test_*`` method,
    so unittest will not invoke ``check_decay_with_place`` on its own.
    Confirm whether a driver (for example inheriting ``test_decay`` from
    ``TestLearningRateDecay``) was intended.
    """

    def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                               kwargs):
        # Wrap the fluid decay in a 10-step linear warmup, run 20 steps on
        # ``place`` and compare with the Python references: the warmup
        # reference while step < warmup_steps, the plain decay afterwards.
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        warmup_steps = 10
        start_lr = 0.1 / 3.
        end_lr = 0.1

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = layers.linear_lr_warmup(fluid_decay_fn(**kwargs),
                                                 warmup_steps, start_lr, end_lr)

        # Use the place handed in by the caller. Previously this was
        # overwritten with CPUPlace(), which ignored the argument.
        exe = fluid.Executor(place)
        exe.run(startup_prog)

        for step in range(20):
            # Step of NoamDecay starts from 1.
            if fluid_decay_fn.__name__ == 'noam_decay':
                step += 1
            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                python_decayed_lr = linear_lr_warmup(float(step), warmup_steps,
                                                     start_lr, end_lr)
            else:
                python_decayed_lr = python_decay_fn(global_step=float(step),
                                                    **kwargs)
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
                msg=
                'Test {0} Failed, step {1}, Python result is {2}, Fluid result is {3}'
                .format(python_decay_fn.__name__, str(step),
                        str(python_decayed_lr), str(lr_val[0])))
479 480


Q
qingqing01 已提交
481
class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
    """Static-graph check of ``layers.linear_lr_warmup`` with a scalar rate.

    The base learning rate is passed as a plain Python number rather than a
    decay variable; both float and int combinations are exercised.
    """

    def run_scalar_lr(self, place, lr, start_lr, end_lr):
        # Build a 10-step warmup around the scalar ``lr``, run 20 steps on
        # ``place`` and expect the warmup reference while step < warmup_steps
        # and ``lr`` itself afterwards.
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        warmup_steps = 10

        with fluid.program_guard(main_prog, startup_prog):
            decayed_lr = layers.linear_lr_warmup(lr, warmup_steps, start_lr,
                                                 end_lr)

        exe = fluid.Executor(place)
        exe.run(startup_prog)

        for step in range(20):
            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                expected_lr = linear_lr_warmup(float(step), warmup_steps,
                                               start_lr, end_lr)
            else:
                expected_lr = lr
            self.assertAlmostEqual(
                expected_lr,
                lr_val[0],
                msg='Test failed, step {0}, expected {1}, but got {2}'.format(
                    step, expected_lr, lr_val[0]))

    def test_scalar_lr(self):
        # Exercise float and int scalars on every available place.

        def run_places(lr, start_lr, end_lr):
            # CPU always; GPU 0 too when Paddle was compiled with CUDA.
            places = [fluid.CPUPlace()]
            if core.is_compiled_with_cuda():
                places.append(fluid.CUDAPlace(0))
            for p in places:
                self.run_scalar_lr(p, lr, start_lr, end_lr)

        # float
        lr = 0.2
        start_lr = 0.1 / 3.
        end_lr = 0.2
        run_places(lr, start_lr, end_lr)

        # int end_lr
        lr = 2.
        start_lr = 0.1 / 3.
        end_lr = 1
        run_places(lr, start_lr, end_lr)

        # int
        lr = 1
        start_lr = 0
        end_lr = 1
        run_places(lr, start_lr, end_lr)


Q
Qiao Longfei 已提交
537 538
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()