# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import unittest

import numpy as np
from test_imperative_base import new_program_scope

import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.optimizer import (
    AdadeltaOptimizer,
    AdagradOptimizer,
    AdamaxOptimizer,
    DecayedAdagradOptimizer,
    DpsgdOptimizer,
    ExponentialMovingAverage,
    FtrlOptimizer,
    LarsMomentumOptimizer,
    LookaheadOptimizer,
    ModelAverage,
    MomentumOptimizer,
    PipelineOptimizer,
    RecomputeOptimizer,
    RMSPropOptimizer,
)

# Note(wangzhongpu)
# In dygraph mode, ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage,
# PipelineOptimizer, LookaheadOptimizer and RecomputeOptimizer are not supported.


class MLP(fluid.Layer):
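    """A small two-layer MLP (784 -> 10 -> 10) used as the model under test."""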
    def __init__(self, param_attr=None, bias_attr=None):
        super().__init__()

        self._fc1 = paddle.nn.Linear(784, 10)
        self._fc2 = paddle.nn.Linear(10, 10)

    def forward(self, inputs):
        y = self._fc1(inputs)
        y = self._fc2(y)
        return y


class TestImperativeOptimizerBase(unittest.TestCase):
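    """Base harness: subclasses supply the optimizer via get_optimizer_dygraph()
    and get_optimizer(); _check_mlp() trains an MLP with it in dygraph and static
    graph mode, and _check_exception() verifies unsupported optimizers raise."""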
    def setUp(self):
        self.batch_num = 20

    def get_optimizer_dygraph(self, parameter_list):
        raise NotImplementedError()

    def get_optimizer(self):
        raise NotImplementedError()

    def reader_decorator(self, reader):
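        """Wrap an MNIST reader so each sample yields a (1, 784) image and an int64 label."""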
        def _reader_imple():
            for item in reader():
                image = np.array(item[0]).reshape(1, 784)
                label = np.array(item[1]).astype('int64').reshape(1)
                yield image, label

        return _reader_imple

    def _check_exception(self, exception_message, place=None):
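        """Build the optimizer in dygraph mode and assert the expected error message."""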
        seed = 90
        batch_size = 128
        if place is None:
            place = (
                fluid.CUDAPlace(0)
                if core.is_compiled_with_cuda()
                else fluid.CPUPlace()
            )

        try:
            paddle.disable_static()
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            mlp = MLP()
            optimizer = self.get_optimizer_dygraph(
                parameter_list=mlp.parameters()
            )
        except Exception as e:
            assert str(e) == exception_message
        finally:
            paddle.enable_static()

    def _check_mlp(self, place=None):
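        """Train the MLP with the optimizer in dygraph mode, then repeat the training
        in static graph mode and compare initial and trained parameter values."""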
        seed = 90
        batch_size = 128

        if place is None:
            place = (
                fluid.CPUPlace()
                if not core.is_compiled_with_cuda()
                else fluid.CUDAPlace(0)
            )

        paddle.disable_static(place)
        paddle.seed(seed)
        paddle.framework.random._manual_program_seed(seed)

        mlp = MLP()
        optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters())

        batch_py_reader = fluid.io.PyReader(capacity=1)
        batch_py_reader.decorate_sample_list_generator(
            paddle.batch(
                self.reader_decorator(paddle.dataset.mnist.train()),
                batch_size=batch_size,
                drop_last=True,
            ),
            places=fluid.CPUPlace(),
        )

        dy_param_init_value = {}
        for batch_id, data in enumerate(batch_py_reader()):
            if batch_id >= self.batch_num:
                break

            img = data[0]
            label = data[1]

            label.stop_gradient = True

            img = paddle.reshape(img, shape=[batch_size, -1])
            cost = mlp(img)
            avg_loss = fluid.layers.reduce_mean(cost)
            dy_out = avg_loss.numpy()

            if batch_id == 0:
                for param in mlp.parameters():
                    dy_param_init_value[param.name] = param.numpy()

            avg_loss.backward()
            optimizer.minimize(avg_loss)
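            # Step the LR scheduler once per iteration; ReduceOnPlateau also needs the loss value.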
            if isinstance(
                optimizer._learning_rate, paddle.optimizer.lr.LRScheduler
            ):
                if isinstance(
                    optimizer._learning_rate,
                    paddle.optimizer.lr.ReduceOnPlateau,
                ):
                    optimizer._learning_rate.step(avg_loss)
                else:
                    optimizer._learning_rate.step()
            mlp.clear_gradients()
            dy_param_value = {}
            for param in mlp.parameters():
                dy_param_value[param.name] = param.numpy()

        paddle.enable_static()
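        # Repeat the same training in static graph mode to compare against the dygraph results.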
        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            if place is None:
                place = (
                    fluid.CPUPlace()
                    if not core.is_compiled_with_cuda()
                    else fluid.CUDAPlace(0)
                )

            exe = fluid.Executor(place)

            mlp = MLP()
            optimizer = self.get_optimizer()
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True
            )

            img = fluid.layers.data(
                name='pixel', shape=[1, 28, 28], dtype='float32'
            )
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            img = paddle.reshape(img, shape=[batch_size, 784])
            cost = mlp(img)
            avg_loss = fluid.layers.reduce_mean(cost)
            optimizer.minimize(avg_loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
            for param in mlp.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(
                fluid.default_startup_program(),
                fetch_list=static_param_name_list,
            )

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            for batch_id, data in enumerate(train_reader()):
                if batch_id >= self.batch_num:
                    break

                static_x_data = np.array(
                    [x[0].reshape(1, 28, 28) for x in data]
                ).astype('float32')
                y_data = (
                    np.array([x[1] for x in data])
                    .astype('int64')
                    .reshape([128, 1])
                )

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                out = exe.run(
                    fluid.default_main_program(),
                    feed={"pixel": static_x_data, "label": y_data},
                    fetch_list=fetch_list,
                )
                if isinstance(
                    optimizer._learning_rate, paddle.optimizer.lr.LRScheduler
                ):
                    if isinstance(
                        optimizer._learning_rate,
                        paddle.optimizer.lr.ReduceOnPlateau,
                    ):
                        optimizer._learning_rate.step(out[0])
                    else:
                        optimizer._learning_rate.step()

                static_param_value = {}
                static_out = out[0]
                for i in range(1, len(out)):
                    static_param_value[static_param_name_list[i - 1]] = out[i]

        for key, value in static_param_init_value.items():
            np.testing.assert_allclose(
                value, dy_param_init_value[key], rtol=1e-05
            )

        if core.is_compiled_with_rocm():
            np.testing.assert_allclose(
                static_out, dy_out, rtol=1e-05, atol=0.001
            )
        else:
            np.testing.assert_allclose(static_out, dy_out, rtol=1e-05)

        for key, value in static_param_value.items():
            if core.is_compiled_with_rocm():
                np.testing.assert_allclose(
                    value, dy_param_value[key], rtol=1e-05, atol=0.001
                )
            else:
                np.testing.assert_allclose(
                    value, dy_param_value[key], rtol=1e-05
                )


class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        bd = [3, 6, 9]
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.PiecewiseDecay(
                boundaries=bd,
                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)],
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        bd = [3, 6, 9]
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.PiecewiseDecay(
                boundaries=bd,
                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)],
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.NaturalExpDecay(
                learning_rate=0.5, gamma=0.9
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.NaturalExpDecay(
                learning_rate=0.5, gamma=0.9
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.ExponentialDecay(
                learning_rate=0.5, gamma=0.9
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.ExponentialDecay(
                learning_rate=0.5, gamma=0.9
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.Adam(
            learning_rate=paddle.optimizer.lr.InverseTimeDecay(
                learning_rate=0.5, gamma=0.9
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.Adam(
            learning_rate=paddle.optimizer.lr.InverseTimeDecay(
                learning_rate=0.5, gamma=0.9
            )
        )
        return optimizer

    def func_test_adam(self):
        self._check_mlp()

    def test_adam(self):
        with _test_eager_guard():
            self.func_test_adam()
        self.func_test_adam()


class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.PolynomialDecay(
                learning_rate=0.5, decay_steps=5, cycle=self.cycle
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.PolynomialDecay(
                learning_rate=0.5, decay_steps=5, cycle=self.cycle
            )
        )
        return optimizer

    def func_test_sgd_cycle(self):
        self.cycle = True
        self._check_mlp()

    def test_sgd_cycle(self):
        with _test_eager_guard():
            self.func_test_sgd_cycle()
        self.func_test_sgd_cycle()

    def func_test_sgd(self):
        self.cycle = False
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
                learning_rate=0.5, T_max=5
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
                learning_rate=0.5, T_max=5
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.NoamDecay(
                d_model=0.01, warmup_steps=100, verbose=True
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.NoamDecay(
                d_model=0.01, warmup_steps=100
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.LambdaDecay(
                learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.LambdaDecay(
                learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.LinearWarmup(
                learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.LinearWarmup(
                learning_rate=0.5,
                warmup_steps=20,
                start_lr=0,
                end_lr=0.5,
                verbose=True,
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.MultiStepDecay(
                learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.MultiStepDecay(
                learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.StepDecay(
                learning_rate=0.5, step_size=5, gamma=0.8
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.StepDecay(
                learning_rate=0.5, step_size=5, gamma=0.8
            )
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
                learning_rate=0.5
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5)
        )
        return optimizer

    def func_test_sgd(self):
        self._check_mlp()

    def test_sgd(self):
        with _test_eager_guard():
            self.func_test_sgd()
        self.func_test_sgd()


class TestOptimizerLearningRate(unittest.TestCase):
    def func_test_constant_lr(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = fluid.layers.reduce_mean(b)

            adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())

            np.testing.assert_allclose(
                adam.get_lr(), 0.001, rtol=1e-06, atol=0.0
            )

            for i in range(10):
                adam.minimize(loss)
                lr = adam.get_lr()

                np.testing.assert_allclose(lr, 0.001, rtol=1e-06, atol=0.0)

    def test_constant_lr(self):
        with _test_eager_guard():
            self.func_test_constant_lr()
        self.func_test_constant_lr()

    def func_test_lr_decay(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = fluid.layers.reduce_mean(b)

            bd = [2, 4, 6, 8]
            value = [0.2, 0.4, 0.6, 0.8, 1.0]

            scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
            adam = paddle.optimizer.Adam(
                scheduler, parameters=linear.parameters()
            )

            np.testing.assert_allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)

            ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
            for i in range(12):
                adam.minimize(loss)
                lr = adam.get_lr()
                np.testing.assert_allclose(lr, ret[i], rtol=1e-06, atol=0.0)
                scheduler.step()

    def test_lr_decay(self):
        with _test_eager_guard():
            self.func_test_lr_decay()
        self.func_test_lr_decay()

    def func_test_lr_scheduler_natural_exp(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)
            a = fluid.dygraph.to_variable(a)
            b = linear(a)

            loss = fluid.layers.reduce_mean(b)
            base_lr = 1.0

            scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5)
            adam = paddle.optimizer.Adam(
                scheduler, parameters=linear.parameters()
            )

            np.testing.assert_allclose(adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)

            ret = [1.0, np.exp(-0.5), np.exp(-1)]
            for i in range(3):
                adam.minimize(loss)
                lr = adam.get_lr()
                np.testing.assert_allclose(lr, ret[i], rtol=1e-06, atol=0.0)
                scheduler.step()

    def test_lr_scheduler_natural_exp(self):
        with _test_eager_guard():
            self.func_test_lr_scheduler_natural_exp()
        self.func_test_lr_scheduler_natural_exp()

    def func_test_set_lr(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = fluid.layers.reduce_mean(b)

            adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
            for i in range(5):
                adam.set_lr(lr_list[i])
                adam.minimize(loss)
                lr = adam.get_lr()
                np.testing.assert_allclose(lr, lr_list[i], rtol=1e-06, atol=0.0)

            with self.assertRaises(TypeError):
                lr_var = fluid.layers.create_global_var(
                    shape=[1], value=0.7, dtype='float32'
                )
                adam.set_lr(lr_var)

            with self.assertRaises(RuntimeError):
                adam = paddle.optimizer.Adam(
                    paddle.optimizer.lr.NaturalExpDecay(
                        learning_rate=0.1, gamma=0.5
                    ),
                    parameters=linear.parameters(),
                )
                adam.set_lr(0.01)

    def test_set_lr(self):
        with _test_eager_guard():
            self.func_test_set_lr()
        self.func_test_set_lr()


class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = MomentumOptimizer(
            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        return optimizer

    def func_test_momentum(self):
        self._check_mlp()

    def test_momentum(self):
        with _test_eager_guard():
            self.func_test_momentum()
        self.func_test_momentum()


class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = LarsMomentumOptimizer(
            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
        return optimizer

    def func_test_larsmomentum(self):
        self._check_mlp()

    def test_larsmomentum(self):
        with _test_eager_guard():
            self.func_test_larsmomentum()
        self.func_test_larsmomentum()


class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = AdagradOptimizer(
            learning_rate=0.2, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = AdagradOptimizer(learning_rate=0.2)
        return optimizer

    def func_test_adagrad(self):
        self._check_mlp()

    def test_adagrad(self):
        with _test_eager_guard():
            self.func_test_adagrad()
        self.func_test_adagrad()


class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = AdamaxOptimizer(
            learning_rate=0.2, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = AdamaxOptimizer(learning_rate=0.2)
        return optimizer

    def func_test_adamax(self):
        self._check_mlp()

    def test_adamax(self):
        with _test_eager_guard():
            self.func_test_adamax()
        self.func_test_adamax()


class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = DpsgdOptimizer(
            learning_rate=0.01,
            clip=10.0,
            batch_size=16.0,
            sigma=1.0,
            parameter_list=parameter_list,
        )
        optimizer._seed = 100
        return optimizer

    def get_optimizer(self):
        optimizer = DpsgdOptimizer(
            learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0
        )
        optimizer._seed = 100
        return optimizer

    def func_test_dpsgd(self):
        self._check_mlp(place=fluid.CPUPlace())

    def test_dpsgd(self):
        with _test_eager_guard():
            self.func_test_dpsgd()
        self.func_test_dpsgd()


class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = DecayedAdagradOptimizer(
            learning_rate=0.2, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = DecayedAdagradOptimizer(learning_rate=0.2)
        return optimizer

    def func_test_decayadagrad(self):
        self._check_mlp()

    def test_decayadagrad(self):
        with _test_eager_guard():
            self.func_test_decayadagrad()
        self.func_test_decayadagrad()


class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = AdadeltaOptimizer(
            learning_rate=0.0003,
            epsilon=1.0e-6,
            rho=0.95,
            parameter_list=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = AdadeltaOptimizer(
            learning_rate=0.0003, epsilon=1.0e-6, rho=0.95
        )
        return optimizer

    def func_test_adadelta(self):
        self._check_mlp()

    def test_adadelta(self):
        with _test_eager_guard():
            self.func_test_adadelta()
        self.func_test_adadelta()


class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = RMSPropOptimizer(
            learning_rate=0.1, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = RMSPropOptimizer(learning_rate=0.1)
        return optimizer

    def func_test_rmsprop(self):
        self._check_mlp()

    def test_rmsprop(self):
        with _test_eager_guard():
            self.func_test_rmsprop()
        self.func_test_rmsprop()


class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = FtrlOptimizer(
            learning_rate=0.1, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = FtrlOptimizer(learning_rate=0.1)
        return optimizer

    def func_test_ftrl(self):
        self._check_mlp()

    def test_ftrl(self):
        with _test_eager_guard():
            self.func_test_ftrl()
        self.func_test_ftrl()


def exclude_fn(param):
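    """Exclude bias parameters (names ending in '.b_0') from Lamb weight decay."""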
    return param.name.endswith('.b_0')


class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.Lamb(
            learning_rate=0.002,
            exclude_from_weight_decay_fn=exclude_fn,
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.Lamb(
            learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn
        )
        return optimizer

    # FIXME: may fail on Windows CI; prefixed with '_' so it is not collected by default.
    def _test_lamb(self):
        self._check_mlp()


class TestImperativeModelAverage(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = ModelAverage(
            0.15, min_average_window=10000, max_average_window=12500
        )
        return optimizer

    def func_test_modelaverage(self):
        exception_message = "In dygraph, don't support ModelAverage."
        self._check_exception(exception_message)

    def test_modelaverage(self):
        with _test_eager_guard():
            self.func_test_modelaverage()
        self.func_test_modelaverage()


class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = DGCMomentumOptimizer(
            learning_rate=0.0001,
            momentum=0.9,
            rampup_step=1000,
            rampup_begin_step=1252,
            sparsity=[0.999, 0.999],
        )
        return optimizer

    def func_test_dgcmomentum(self):
        exception_message = "In dygraph, don't support DGCMomentumOptimizer."
        self._check_exception(exception_message)

    def test_dgcmomentum(self):
        with _test_eager_guard():
            self.func_test_dgcmomentum()
        self.func_test_dgcmomentum()


class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = ExponentialMovingAverage(0.999)
        return optimizer

    def func_test_exponentialmoving(self):
        exception_message = (
            "In dygraph, don't support ExponentialMovingAverage."
        )
        self._check_exception(exception_message)

    def test_exponentialmoving(self):
        with _test_eager_guard():
            self.func_test_exponentialmoving()
        self.func_test_exponentialmoving()


class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.5, parameters=parameter_list
        )
        optimizer = PipelineOptimizer(optimizer)
        return optimizer

    def func_test_pipeline(self):
        exception_message = "In dygraph, don't support PipelineOptimizer."
        self._check_exception(exception_message)

    def test_pipeline(self):
        with _test_eager_guard():
            self.func_test_pipeline()
        self.func_test_pipeline()


class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.5, parameters=parameter_list
        )
        optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
        return optimizer

    def func_test_lookahead(self):
        exception_message = "In dygraph, don't support LookaheadOptimizer."
        self._check_exception(exception_message)

    def test_lookahead(self):
        with _test_eager_guard():
            self.func_test_lookahead()
        self.func_test_lookahead()


class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.5, parameters=parameter_list
        )
        optimizer = RecomputeOptimizer(optimizer)
        return optimizer

    def func_test_recompute(self):
        exception_message = "In dygraph, don't support RecomputeOptimizer."
        self._check_exception(exception_message)

    def test_recompute(self):
        with _test_eager_guard():
            self.func_test_recompute()
        self.func_test_recompute()


class TestImperativeOptimizerList(unittest.TestCase):
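    """Check that an optimizer accepts parameters given as a chained iterator over layers."""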
    def func_test_parameter_list(self):
        with fluid.dygraph.guard():
            linear_1 = paddle.nn.Linear(10, 10)
            linear_2 = paddle.nn.Linear(10, 10)

            sgd = paddle.optimizer.SGD(
                1.0,
                parameters=itertools.chain(
                    linear_1.parameters(), linear_2.parameters()
                ),
            )

            in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            in_data = fluid.dygraph.to_variable(in_np)

            y = linear_1(in_data)
            y = linear_2(y)
            loss = fluid.layers.reduce_mean(y)
            loss.backward()
            sgd.minimize(loss)

            self.assertTrue(
                len(sgd._parameter_list)
                == len(linear_1.parameters() + linear_2.parameters())
            )

    def test_parameter_list(self):
        with _test_eager_guard():
            self.func_test_parameter_list()
        self.func_test_parameter_list()


if __name__ == '__main__':
    unittest.main()