test_adam_op.py 44.3 KB
Newer Older
1
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
D
dzhwinter 已提交
2
#
D
dzhwinter 已提交
3 4 5
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
D
dzhwinter 已提交
6
#
D
dzhwinter 已提交
7
#     http://www.apache.org/licenses/LICENSE-2.0
D
dzhwinter 已提交
8
#
D
dzhwinter 已提交
9 10 11 12 13 14
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15 16
from __future__ import print_function

17 18
import unittest
import numpy as np
19
from op_test import OpTest
20 21
from paddle.fluid import core
from paddle.fluid.op import Operator
22
import paddle.fluid as fluid
M
MRXLT 已提交
23
import paddle
C
chentianyu03 已提交
24
from paddle.fluid.framework import _test_eager_guard
25 26 27 28


class TestAdamOp1(OpTest):
    '''Dense adam op check with beta1, beta2 and epsilon passed as attributes.'''

    def setUp(self):
        '''Build random inputs and the numpy reference outputs of one step.'''
        self.op_type = "adam"

        shape = (102, 105)
        learning_rate = 0.004
        beta1, beta2, epsilon = 0.78, 0.836, 1e-4

        # The draws stay in the order param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, shape).astype("float32")
        grad = np.random.uniform(-1, 1, shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is positive, hence the [0, 1) generator.
        moment2 = np.random.random(shape).astype("float32")

        # Beta powers as they would look after ten accumulated steps.
        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
        }
        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        expected = adam_step(self.inputs, self.attrs)
        ref_param, ref_moment1, ref_moment2 = expected

        self.outputs = {
            'Moment1Out': ref_moment1,
            'Moment2Out': ref_moment2,
            'ParamOut': ref_param,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def test_check_output(self):
        '''Compare the operator outputs with the numpy reference.'''
        self.check_output()


class TestAdamOp2(OpTest):
    '''Dense adam op check whose tensor shape comes from ``set_shape``.

    The hyper-parameters are handed to the numpy reference only;
    ``self.attrs`` is not assigned by this class.
    '''

    def set_shape(self):
        '''Hook for subclasses: choose the shape of every tensor input.'''
        self.shape = (102, 105)

    def setUp(self):
        '''Build random inputs and the numpy reference outputs of one step.'''
        self.op_type = "adam"
        self.set_shape()

        learning_rate = 0.001
        beta1, beta2, epsilon = 0.9, 0.999, 1e-8

        # The draws stay in the order param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, self.shape).astype("float32")
        grad = np.random.uniform(-1, 1, self.shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, self.shape).astype("float32")
        # The second moment is positive, hence the [0, 1) generator.
        moment2 = np.random.random(self.shape).astype("float32")

        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
        }

        # Only the reference implementation sees these values.
        reference_attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
        ref_param, ref_moment1, ref_moment2 = adam_step(
            self.inputs, reference_attrs
        )

        self.outputs = {
            'Moment1Out': ref_moment1,
            'Moment2Out': ref_moment2,
            'ParamOut': ref_param,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def test_check_output(self):
        '''Compare the operator outputs with the numpy reference.'''
        self.check_output()


117 118
class TestAdamOnlyTailOp(TestAdamOp2):
    '''Repeat the TestAdamOp2 check with a three-element tensor shape.'''

    def set_shape(self):
        '''Use the integer shape 3 instead of the parent's (102, 105).'''
        self.shape = 3
120 121


122 123
class TestAdamOpMultipleSteps(OpTest):
    '''Check the adam op over ``num_steps`` consecutive optimizer steps.'''

    def setUp(self):
        '''Prepare the first step's inputs and the attributes used by all steps.'''
        self.op_type = "adam"
        self.num_steps = 10

        shape = (102, 105)
        # The draws stay in the order param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, shape).astype("float32")
        grad = np.random.uniform(-1, 1, shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is positive, hence the [0, 1) generator.
        moment2 = np.random.random(shape).astype("float32")

        learning_rate = 0.001
        epsilon = 1e-8
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.beta1_pow = self.beta1**10
        self.beta2_pow = self.beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([self.beta2_pow]).astype("float32"),
        }

        self.attrs = {
            'epsilon': epsilon,
            'beta1': self.beta1,
            'beta2': self.beta2,
        }

    def test_check_output(self):
        '''Verify every step, feeding each step's outputs into the next one.'''
        for _ in range(self.num_steps):
            ref_param, ref_moment1, ref_moment2 = adam_step(
                self.inputs, self.attrs
            )
            next_beta1_pow = self.inputs['Beta1Pow'] * self.beta1
            next_beta2_pow = self.inputs['Beta2Pow'] * self.beta2

            self.outputs = {
                'Moment1Out': ref_moment1,
                'Moment2Out': ref_moment2,
                'ParamOut': ref_param,
                'Beta1PowOut': next_beta1_pow,
                'Beta2PowOut': next_beta2_pow,
            }

            # Verify output for this step
            self.check_output()

            # Roll the state forward and draw a fresh random gradient so the
            # next iteration starts from this step's results.
            self.inputs.update(
                {
                    'Param': ref_param,
                    'Moment1': ref_moment1,
                    'Moment2': ref_moment2,
                    'Beta1Pow': next_beta1_pow,
                    'Beta2Pow': next_beta2_pow,
                    'Grad': np.random.uniform(-1, 1, (102, 105)).astype(
                        "float32"
                    ),
                }
            )

    def test_api_eager_dygraph(self):
        '''Run the multi-step check again inside ``_test_eager_guard``.'''
        with _test_eager_guard():
            self.test_check_output()

194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212

def adam_step(inputs, attributes):
    '''
    Simulate one step of the adam optimizer with numpy.

    Each hyper-parameter is read from ``attributes`` when present there;
    otherwise the first element of the matching tensor in ``inputs`` is used
    ('Beta1Tensor', 'Beta2Tensor' or 'EpsilonTensor').

    :param inputs: dict with 'Param', 'Grad', 'Moment1', 'Moment2',
        'LearningRate', 'Beta1Pow' and 'Beta2Pow', plus the optional
        hyper-parameter tensors named above
    :param attributes: dict that may hold 'beta1', 'beta2' and 'epsilon'
    :return tuple: (param_out, moment1_out, moment2_out)
    :raises KeyError: if a required input is missing, or a hyper-parameter is
        found neither in ``attributes`` nor in ``inputs``
    '''

    def hyper_param(attr_name, tensor_name):
        # The attribute wins; the tensor is only touched when the attribute
        # is absent, so callers that pass attributes need not pass tensors.
        if attr_name in attributes:
            return attributes[attr_name]
        return inputs[tensor_name][0]

    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = hyper_param('epsilon', 'EpsilonTensor')
    beta1 = hyper_param('beta1', 'Beta1Tensor')
    beta2 = hyper_param('beta2', 'Beta2Tensor')

    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    # Bias correction is folded into the learning rate.
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out
227 228


R
Roc 已提交
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
def adamw_step(inputs, attributes):
    '''
    Simulate one step of the adamw optimizer with numpy.

    When ``attributes`` has a truthy 'with_decay', the parameter is first
    scaled by ``1 - lr * coeff``; the regular adam update is then applied to
    that value. 'coeff' is read in every case.

    :param inputs: dict with 'Param', 'Grad', 'Moment1', 'Moment2',
        'LearningRate', 'Beta1Pow' and 'Beta2Pow'; 'Beta1Tensor' and
        'Beta2Tensor' are read when the matching attribute is absent
    :param attributes: dict with 'epsilon' and 'coeff', optionally
        'with_decay', 'beta1' and 'beta2'
    :return tuple: (param_out, moment1_out, moment2_out)
    '''
    weights = inputs['Param']
    gradient = inputs['Grad']
    first_moment = inputs['Moment1']
    second_moment = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']
    coeff = attributes["coeff"]

    # Decoupled weight decay: shrink the parameter before the adam update.
    if attributes.get("with_decay", False):
        weights = weights * (1.0 - lr * coeff)

    # Attributes take precedence; the tensors are only read as a fallback.
    beta1 = (
        attributes['beta1']
        if 'beta1' in attributes
        else inputs['Beta1Tensor'][0]
    )
    beta2 = (
        attributes['beta2']
        if 'beta2' in attributes
        else inputs['Beta2Tensor'][0]
    )

    moment1_out = beta1 * first_moment + (1 - beta1) * gradient
    moment2_out = beta2 * second_moment + (1 - beta2) * np.square(gradient)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    denominator = np.sqrt(moment2_out) + epsilon
    param_out = weights - lr_t * (moment1_out / denominator)

    return param_out, moment1_out, moment2_out


268 269 270
def adam_step_sparse(
    inputs, attributes, height, rows, row_numel, np_grad, lazy_mode
):
    '''
    Simulate one step of the adam optimizer for a row-sparse gradient.

    All three outputs start as zero arrays of shape (height, row_numel). In
    lazy mode only the rows listed in ``rows`` are updated, so every other
    output row stays zero. Otherwise every row is updated, and rows that are
    not listed use a zero gradient.

    :param inputs: dict with 'Param', 'Moment1', 'Moment2', 'LearningRate',
        'Beta1Pow' and 'Beta2Pow'
    :param attributes: dict with 'beta1', 'beta2' and 'epsilon'
    :param height: number of rows of the dense parameter
    :param rows: sequence of row ids (list, tuple or 1-D array); ``rows[i]``
        is the parameter row that ``np_grad[i]`` belongs to
    :param row_numel: number of elements per row
    :param np_grad: gradient values, one entry per element of ``rows``
    :param lazy_mode: update only the listed rows when true
    :return tuple: (param_out, moment1_out, moment2_out)
    '''
    param = inputs['Param']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    beta1 = attributes['beta1']
    beta2 = attributes['beta2']
    epsilon = attributes['epsilon']

    moment1_out = np.zeros(shape=[height, row_numel])
    moment2_out = np.zeros(shape=[height, row_numel])
    param_out = np.zeros(shape=[height, row_numel])

    # The bias-corrected learning rate is the same for every row.
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)

    def update_row(row_id, update_value):
        # Every row is computed from the *input* state, never from rows
        # already written to the outputs.
        moment1_out[row_id] = (
            beta1 * moment1[row_id] + (1 - beta1) * update_value
        )
        moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square(
            update_value
        )
        param_out[row_id] = param[row_id] - lr_t * (
            moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)
        )

    if lazy_mode:
        for idx, row_id in enumerate(rows):
            update_row(row_id, np_grad[idx])
    else:
        # Map each row id to the position of its first occurrence in ``rows``
        # (what ``list.index`` would return), built once instead of searched
        # per row; this also works when ``rows`` is not a list.
        first_position = {}
        for idx, row_id in enumerate(rows):
            first_position.setdefault(int(row_id), idx)

        # Rows without a gradient entry are updated with zeros.
        zero_update = np.zeros(row_numel).astype("float32")
        for row_id in range(height):
            idx = first_position.get(row_id)
            update_row(row_id, zero_update if idx is None else np_grad[idx])

    return param_out, moment1_out, moment2_out


class TestSparseAdamOp(unittest.TestCase):
    '''Run the adam operator on a SelectedRows gradient and compare the
    results with the ``adam_step_sparse`` numpy reference.'''

    def setup(self, scope, place, lazy_mode):
        '''
        Create the sparse gradient in ``scope`` and compute expected outputs.

        This is a plain helper called from ``check_with_place``, not the
        unittest ``setUp`` hook.

        :param scope: scope that receives the 'Grad' variable
        :param place: device place the gradient tensor is written to
        :param lazy_mode: forwarded to the numpy reference
        '''
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        height = 10
        rows = [0, 4, 7]
        self.rows = rows
        row_numel = 12
        self.row_numel = row_numel
        self.dense_inputs = {
            "Param": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
            "LearningRate": np.full((1), 2.0).astype("float32"),
        }
        # Every output variable is pre-filled with this array before the
        # operator runs.
        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
        self.attrs = {
            'epsilon': epsilon,
            'beta1': beta1,
            'beta2': beta2,
            'min_row_size_to_use_multithread': 2,
        }

        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
        # All-ones gradient with two distinct entries so that the rows do not
        # all produce the same update.
        np_array = np.ones((len(rows), row_numel)).astype("float32")
        np_array[0, 0] = 2.0
        np_array[2, 8] = 4.0

        grad_tensor = grad_selected_rows.get_tensor()
        grad_tensor.set(np_array, place)

        self.sparse_inputs = ["Grad"]

        param_out, mom1, mom2 = adam_step_sparse(
            self.dense_inputs,
            self.attrs,
            height,
            rows,
            row_numel,
            np_array,
            lazy_mode,
        )
        self.outputs = {
            "ParamOut": param_out,
            "Moment1Out": mom1,
            "Moment2Out": mom2,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def check_with_place(self, place, lazy_mode):
        '''
        Run the adam operator on ``place`` and check every output element.

        :param place: device place to run on
        :param lazy_mode: value of the operator's 'lazy_mode' attribute
        '''
        scope = core.Scope()
        self.setup(scope, place, lazy_mode)

        op_args = {}
        op_args['lazy_mode'] = lazy_mode
        for key, np_array in self.dense_inputs.items():
            var = scope.var(key).get_tensor()
            var.set(np_array, place)
            op_args[key] = key
        for s in self.sparse_inputs:
            op_args[s] = s
        for s in self.outputs:
            var = scope.var(s).get_tensor()
            var.set(self.init_output, place)
            op_args[s] = s
        for k in self.attrs:
            op_args[k] = self.attrs[k]

        # create and run adam operator
        adam_op = Operator("adam", **op_args)
        adam_op.run(scope, place)

        for key, np_array in self.outputs.items():
            out_var = scope.var(key).get_tensor()
            actual = np.array(out_var)
            actual = actual.reshape([actual.size])
            np_array = np_array.reshape([np_array.size])

            for i in range(np_array.size):
                # Compare the magnitude of the error: a signed difference
                # would let any negative deviation pass.
                self.assertLess(abs(actual[i] - np_array[i]), 0.00001)

    def test_sparse_adam(self):
        '''Check lazy and non-lazy mode on CPU and, when available, CUDA.'''
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            for lazy_mode in (True, False):
                self.check_with_place(place, lazy_mode)
T
wip  
typhoonzero 已提交
416 417


418 419
class TestAdamOpBetaVariable(OpTest):
    '''Dense adam op check with beta1/beta2 given as input tensors.'''

    def setUp(self):
        '''Build random inputs and the numpy reference outputs of one step.'''
        self.op_type = "adam"

        shape = (102, 105)
        # The draws stay in the order param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, shape).astype("float32")
        grad = np.random.uniform(-1, 1, shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is positive, hence the [0, 1) generator.
        moment2 = np.random.random(shape).astype("float32")

        beta1, beta2 = 0.85, 0.95
        learning_rate, epsilon = 0.001, 1e-8
        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
        }

        # No beta attributes here, so the reference reads the beta tensors.
        reference_attrs = {'epsilon': epsilon}
        ref_param, ref_moment1, ref_moment2 = adam_step(
            self.inputs, reference_attrs
        )

        self.outputs = {
            'Moment1Out': ref_moment1,
            'Moment2Out': ref_moment2,
            'ParamOut': ref_param,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def test_check_output(self):
        '''Compare the operator outputs with the numpy reference.'''
        self.check_output()


463 464
class TestAdamOpBetaEpsilonVariable(OpTest):
    '''Dense adam op check with beta1/beta2/epsilon given as input tensors.'''

    def setUp(self):
        '''Build random inputs and the numpy reference outputs of one step.'''
        self.op_type = "adam"

        shape = (102, 105)
        # The draws stay in the order param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, shape).astype("float32")
        grad = np.random.uniform(-1, 1, shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is positive, hence the [0, 1) generator.
        moment2 = np.random.random(shape).astype("float32")

        beta1, beta2 = 0.85, 0.95
        learning_rate, epsilon = 0.001, 1e-8
        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
        }

        # The reference still receives epsilon as a plain attribute.
        reference_attrs = {'epsilon': epsilon}
        ref_param, ref_moment1, ref_moment2 = adam_step(
            self.inputs, reference_attrs
        )

        self.outputs = {
            'Moment1Out': ref_moment1,
            'Moment2Out': ref_moment2,
            'ParamOut': ref_param,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def test_check_output(self):
        '''Compare the operator outputs with the numpy reference.'''
        self.check_output()


509 510
class TestAdamOpWithGlobalBetaPow(OpTest):
    '''Dense adam op check with the 'use_global_beta_pow' attribute enabled.'''

    def setUp(self):
        '''Build random inputs and the numpy reference outputs of one step.'''
        self.op_type = "adam"

        shape = (102, 105)
        # The draws stay in the order param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, shape).astype("float32")
        grad = np.random.uniform(-1, 1, shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, shape).astype("float32")
        # The second moment is positive, hence the [0, 1) generator.
        moment2 = np.random.random(shape).astype("float32")

        beta1, beta2 = 0.85, 0.95
        learning_rate, epsilon = 0.001, 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
        }
        self.attrs = {'use_global_beta_pow': True}

        # The reference only needs epsilon; betas come from the tensors.
        ref_param, ref_moment1, ref_moment2 = adam_step(
            self.inputs, {'epsilon': epsilon}
        )

        # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
        empty_beta1_pow = np.array([])
        empty_beta2_pow = np.array([])
        self.outputs = {
            'Moment1Out': ref_moment1,
            'Moment2Out': ref_moment2,
            'ParamOut': ref_param,
            'Beta1PowOut': empty_beta1_pow,
            'Beta2PowOut': empty_beta2_pow,
        }

    def test_check_output(self):
        '''Compare the operator outputs with the expected values.'''
        self.check_output()


558 559
class TestAdamOpWithSkipUpdate(OpTest):
    '''Dense adam op check with the 'SkipUpdate' input set to True.'''

    def setUp(self):
        '''Test Adam Op with SkipUpdate: outputs must equal the inputs'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
            "SkipUpdate": np.array([True]).astype("bool"),
        }

        self.attrs = {'use_global_beta_pow': True}

        # No reference step is computed: with SkipUpdate set, each expected
        # output is the corresponding input, beta powers included.
        self.outputs = {
            'Moment1Out': moment1,
            'Moment2Out': moment2,
            'ParamOut': param,
            'Beta1PowOut': self.inputs['Beta1Pow'],
            'Beta2PowOut': self.inputs['Beta2Pow'],
        }

    def test_check_output(self):
        '''Compare the operator outputs with the untouched inputs.'''
        self.check_output()


M
MRXLT 已提交
606 607 608
class TestAdamOpV2(unittest.TestCase):
    '''API-level checks of ``paddle.optimizer.Adam`` in static and dygraph mode.

    Every dygraph test switches static mode off at the start and back on at
    the end.
    '''

    def test_adam_op(self):
        '''Minimize a conv2d loss in a static program with tensor betas.'''
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = fluid.layers.conv2d(data, 8, 3)
                loss = fluid.layers.reduce_mean(conv)

                beta1 = fluid.layers.create_global_var(
                    shape=[1], value=0.85, dtype='float32', persistable=True
                )
                beta2 = fluid.layers.create_global_var(
                    shape=[1], value=0.95, dtype='float32', persistable=True
                )
                opt = paddle.optimizer.Adam(
                    learning_rate=1e-5,
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01,
                    epsilon=1e-8,
                )
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        self.assertIsNotNone(rets[0])

    def test_adam_op_dygraph(self):
        '''Run one backward/step/clear_gradients cycle on a Linear layer.'''
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = fluid.Linear(13, 5, dtype="float32")

        adam = paddle.optimizer.Adam(
            learning_rate=0.01, parameters=linear.parameters()
        )
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()
        paddle.enable_static()

    def test_adam_op_with_state_dict(self):
        '''Round-trip state_dict for float and scheduler learning rates and
        check that a Tensor learning rate raises TypeError.'''

        paddle.disable_static()
        emb = paddle.nn.Embedding(10, 10)

        adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is LRScheduler
        learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
            learning_rate=0.1, T_max=10
        )
        adam = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            weight_decay=fluid.regularizer.L2Decay(0.001),
            parameters=emb.parameters(),
        )
        # Called for its own sake; the returned value is not checked.
        adam.get_lr()
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is Tensor
        with self.assertRaises(TypeError):
            learning_rate = np.array([0.01]).astype("float32")
            learning_rate = paddle.to_tensor(learning_rate)
            adam = paddle.optimizer.Adam(
                learning_rate=learning_rate, parameters=emb.parameters()
            )

        # ``adam`` is still the scheduler-based optimizer created above,
        # because the assignment inside the assertRaises block never ran.
        params = adam.get_opti_var_name_list()
        self.assertIsNotNone(params)
        paddle.enable_static()

    def test_adam_with_grad_clip(self):
        '''Run one optimizer step with global-norm gradient clipping.'''
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = fluid.Linear(13, 5, dtype="float32")
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
        adam = paddle.optimizer.Adam(
            0.1, parameters=linear.parameters(), grad_clip=clip
        )
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()
        paddle.enable_static()

    def test_adam_op_with_set_lr(self):
        '''set_lr accepts a float and raises TypeError for a variable.'''
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

        lr = 0.01
        adam.set_lr(lr)
        cur_lr = adam.get_lr()
        self.assertEqual(lr, cur_lr)
        with self.assertRaises(TypeError):
            lr_var = paddle.fluid.layers.create_global_var(
                shape=[1], value=lr, dtype='float32'
            )
            adam.set_lr(lr_var)
        paddle.enable_static()

    def test_adam_op_invalid_input(self):
        '''Negative beta1, beta2 or epsilon must raise ValueError.'''
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        with self.assertRaises(ValueError):
            paddle.optimizer.Adam(
                0.1, beta1=-1, parameters=linear.parameters()
            )
        with self.assertRaises(ValueError):
            paddle.optimizer.Adam(
                0.1, beta2=-1, parameters=linear.parameters()
            )
        with self.assertRaises(ValueError):
            paddle.optimizer.Adam(
                0.1, epsilon=-1, parameters=linear.parameters()
            )
        paddle.enable_static()

    def test_adam_op_with_sparse_input_and_weight_decay(self):
        '''A sparse embedding combined with weight_decay must raise
        RuntimeError somewhere in forward, backward or step.'''

        paddle.disable_static()
        x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
        x = paddle.to_tensor(x_data, stop_gradient=False)
        emb = paddle.nn.Embedding(10, 10, sparse=True)
        adam = paddle.optimizer.Adam(
            0.001, parameters=emb.parameters(), weight_decay=0.01
        )

        with self.assertRaises(RuntimeError):
            out = emb(x)
            out.backward()
            adam.step()
        paddle.enable_static()

    def test_api_eager_dygraph(self):
        '''Repeat the dygraph tests inside ``_test_eager_guard``.'''
        with _test_eager_guard():
            self.test_adam_op_dygraph()
            self.test_adam_op_with_state_dict()
            self.test_adam_with_grad_clip()
            self.test_adam_op_with_set_lr()
            self.test_adam_op_with_sparse_input_and_weight_decay()

761

762
class TestAdamOptimizer(unittest.TestCase):
    """API-level tests for Adam built via the fluid and paddle front ends."""

    def _test(
        self,
        place,
        use_tensor=True,
        use_fluid_api=True,
        use_global_beta_pow=False,
        flatten_param_grads=False,
    ):
        """Train a small two-layer network for 10 steps in static mode.

        Args:
            place: Device place handed to ``paddle.static.Executor``.
            use_tensor: If True, beta1/beta2/epsilon are passed as global
                variables; otherwise as Python floats.
            use_fluid_api: If True, build ``fluid.optimizer.Adam``;
                otherwise build ``paddle.optimizer.Adam``.
            use_global_beta_pow: Forwarded to ``fluid.optimizer.Adam`` only.
            flatten_param_grads: Forwarded to ``fluid.optimizer.Adam`` only.

        Returns:
            The ``(prediction, loss)`` pair fetched in the last step.
        """
        paddle.enable_static()
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        SEED = 2021
        paddle.seed(SEED)
        np.random.seed(SEED)

        a_np = np.random.random(size=(2, 2)).astype('float32')
        b_np = np.random.random(size=(2, 2)).astype('float32')
        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
        weight_attr1 = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            trainable=True,
        )
        weight_attr2 = paddle.ParamAttr(
            name="weight2",
            initializer=fluid.initializer.Constant(value=2.0),
            trainable=True,
        )
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

        with paddle.static.program_guard(main_prog, startup_prog):
            with paddle.utils.unique_name.guard():
                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
                label = paddle.static.data(
                    name="label", shape=[2, 1], dtype='int64'
                )

                # Named ab_sum so the builtin ``sum`` is not shadowed.
                ab_sum = paddle.add(a, b)
                z = paddle.pow(ab_sum, 2.0)

                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
                prediction = fluid.layers.fc(
                    input=fc_1, size=2, param_attr=weight_attr2, act='softmax'
                )

                cost = fluid.layers.cross_entropy(input=prediction, label=label)
                loss = fluid.layers.reduce_mean(cost)
                beta1_init = 0.9
                beta2_init = 0.999
                epsilon_init = 1e-8
                if use_tensor:
                    beta1 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta1_init),
                        dtype='float32',
                        persistable=True,
                        name="beta1",
                    )
                    beta2 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta2_init),
                        dtype='float32',
                        persistable=True,
                        name="beta2",
                    )
                    epsilon = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(epsilon_init),
                        dtype='float32',
                        persistable=True,
                        name="epsilon",
                    )
                else:
                    beta1 = beta1_init
                    beta2 = beta2_init
                    epsilon = epsilon_init

                if use_fluid_api:
                    adam = fluid.optimizer.Adam(
                        learning_rate=0.01,
                        beta1=beta1,
                        beta2=beta2,
                        epsilon=epsilon,
                        use_global_beta_pow=use_global_beta_pow,
                        flatten_param_grads=flatten_param_grads,
                        align_size=256,
                        grad_clip=clip,
                    )
                else:
                    # The float-valued (use_tensor=False) case previously
                    # built fluid.optimizer.Adam here as well, so the paddle
                    # API was never exercised with Python-float betas.
                    adam = paddle.optimizer.Adam(
                        learning_rate=0.01,
                        beta1=beta1,
                        beta2=beta2,
                        epsilon=epsilon,
                        grad_clip=clip,
                    )

                adam.minimize(loss)

        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe = paddle.static.Executor(place)
            exe.run(startup_prog)

            print("Start run on {}".format(place))
            for epoch in range(10):
                pred_res, loss_res = exe.run(
                    main_prog,
                    feed={"a": a_np, "b": b_np, "label": label_np},
                    fetch_list=[prediction, loss],
                )
                print(
                    "Epoch {} | Prediction[0]: {}, Loss: {}".format(
                        epoch, pred_res[0], loss_res
                    )
                )
            paddle.disable_static()
            return pred_res, loss_res

    def _test_with_place(self, place):
        """Run ``_test`` for every flag combination and compare the results.

        All 16 combinations of ``use_tensor``, ``use_fluid_api``,
        ``use_global_beta_pow`` and ``flatten_param_grads`` must produce the
        same prediction and loss as the first combination (rtol 1e-05).
        """
        import itertools

        preds = []
        losses = []

        # product() varies the last flag fastest, matching the positional
        # order (use_tensor, use_fluid_api, use_global_beta_pow,
        # flatten_param_grads) expected by _test.
        for flags in itertools.product([True, False], repeat=4):
            pred, loss = self._test(place, *flags)
            preds.append(pred)
            losses.append(loss)

        for pred in preds:
            np.testing.assert_allclose(pred, preds[0], rtol=1e-05)
        for loss in losses:
            np.testing.assert_allclose(loss, losses[0], rtol=1e-05)

    def test_adam_api(self):
        """Check API consistency on CPU and, when available, on CUDA."""
        # NOTE(zhiqiu): cpu and gpu has different seed, so should compare
        # separately.
        self._test_with_place(paddle.CPUPlace())
        if core.is_compiled_with_cuda():
            self._test_with_place(paddle.CUDAPlace(0))

    def test_adam_flatten_param_grads_with_regularizer(self):
        """Expect ``_flatten_param_grads`` to be False after ``minimize``.

        The optimizer is created with ``flatten_param_grads=True`` on a
        parameter that carries an L1 regularizer.
        """
        # flatten_param_grads + regularizer is not supported yet.
        paddle.enable_static()
        main = fluid.Program()
        weight_attr = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            regularizer=fluid.regularizer.L1DecayRegularizer(
                regularization_coeff=0.1
            ),
            trainable=True,
        )
        with fluid.program_guard(main):
            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
            y_predict = fluid.layers.fc(
                input=x, size=1, act=None, param_attr=weight_attr
            )
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_cost = paddle.mean(cost)

            adam = fluid.optimizer.AdamOptimizer(
                0.01, flatten_param_grads=True, align_size=256
            )
            adam.minimize(avg_cost)
            paddle.disable_static()

            self.assertEqual(adam._flatten_param_grads, False)

    def test_adam_exception(self):
        """Exercise the error paths of the global-accumulator helpers.

        Getting an unknown accumulator and re-adding an existing one are
        both expected to raise.
        """
        paddle.enable_static()
        a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
        b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
        label = paddle.static.data(name="label", shape=[32, 1], dtype='int64')

        # Named ab_sum so the builtin ``sum`` is not shadowed.
        ab_sum = paddle.add(a, b)
        z = paddle.pow(ab_sum, 2.0)

        fc_1 = fluid.layers.fc(input=z, size=128)
        prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')

        cost = fluid.layers.cross_entropy(input=prediction, label=label)
        loss = fluid.layers.reduce_mean(cost)
        adam = fluid.optimizer.Adam(use_global_beta_pow=True)
        adam.minimize(loss)
        self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
        adam._add_global_accumulator(
            'tmp', type=core.VarDesc.VarType.LOD_TENSOR
        )
        adam._get_global_accumulator('tmp')
        self.assertRaises(
            Exception,
            adam._add_global_accumulator,
            adam._beta1_pow_acc_str,
            type=core.VarDesc.VarType.LOD_TENSOR,
        )
        paddle.disable_static()

    def test_adam_save_load(self):
        """Save and reload the optimizer state dict in dygraph mode."""
        paddle.disable_static()
        try:
            a = paddle.rand([4, 10])
            linear = paddle.nn.Linear(10, 10)
            b = linear(a)
            state_dict = linear.state_dict()
            fluid.save_dygraph(state_dict, "paddle_dy")

            scheduler = paddle.optimizer.lr.NoamDecay(
                d_model=0.01, warmup_steps=100, verbose=True
            )
            adam = paddle.fluid.optimizer.Adam(
                learning_rate=scheduler,
                parameter_list=linear.parameters(),
                use_global_beta_pow=True,
            )
            adam.minimize(b)
            state_dict = adam.state_dict()
            fluid.save_dygraph(state_dict, "paddle_dy")
            _, opt_state_dict = fluid.load_dygraph("paddle_dy")
            adam.set_state_dict(opt_state_dict)
        finally:
            # Restore static mode even if saving or loading fails.
            paddle.enable_static()

    def test_adam_save_load_error(self):
        """Expect ``set_state_dict`` to reject mismatching state.

        A state dict saved from a float32 optimizer is loaded into a float64
        optimizer, and then into a float32 optimizer after one entry was
        replaced by an array of a different shape; both are expected to
        raise ``AssertionError``.
        """
        paddle.disable_static()
        original_dtype = paddle.get_default_dtype()

        def get_opt(dtype):
            """Build an Adam optimizer over a fresh Linear(10, 10)."""
            with paddle.utils.unique_name.guard():
                paddle.set_default_dtype(dtype)
                a = paddle.rand([4, 10])
                linear = paddle.nn.Linear(10, 10)
                b = linear(a)
                state_dict = linear.state_dict()
                fluid.save_dygraph(state_dict, "paddle_dy")

                scheduler = paddle.optimizer.lr.NoamDecay(
                    d_model=0.01, warmup_steps=100, verbose=True
                )
                adam = paddle.fluid.optimizer.Adam(
                    learning_rate=scheduler,
                    parameter_list=linear.parameters(),
                    use_global_beta_pow=True,
                )
                adam.minimize(b)
                return adam

        try:
            adam = get_opt('float32')

            state_dict = adam.state_dict()
            fluid.save_dygraph(state_dict, "paddle_dy")
            _, opt_state_dict = fluid.load_dygraph("paddle_dy")
            adam.set_state_dict(opt_state_dict)

            adam2 = get_opt('float64')  # dtype not match
            self.assertRaises(
                AssertionError, adam2.set_state_dict, opt_state_dict
            )

            adam3 = get_opt('float32')  # shape not match
            opt_state_dict['beta1_pow_acc_0'] = np.array(
                [0.9, 0.9], dtype='float32'
            )
            self.assertRaises(
                AssertionError, adam3.set_state_dict, opt_state_dict
            )
        finally:
            # get_opt changes the global default dtype; put it back so a
            # failure part-way through cannot leave float64 as the default.
            paddle.set_default_dtype(original_dtype)
            paddle.enable_static()

1048

1049 1050 1051 1052 1053 1054 1055 1056
class TestAdamOpV2Group(TestAdamOpV2):
    """TestAdamOpV2 variant whose ``test_adam_op`` uses parameter groups."""

    def test_adam_op(self):
        """Run one dygraph Adam step with per-group option overrides.

        The second parameter group overrides ``weight_decay``, ``beta1`` and
        ``beta2``; the first group uses the optimizer-level settings.
        """
        paddle.disable_static()
        try:
            value = np.arange(26).reshape(2, 13).astype("float32")
            a = paddle.to_tensor(value)
            linear_1 = paddle.nn.Linear(13, 5)
            linear_2 = paddle.nn.Linear(5, 3)
            # This can be any optimizer supported by dygraph.
            adam = paddle.optimizer.Adam(
                learning_rate=0.01,
                parameters=[
                    {'params': linear_1.parameters()},
                    {
                        'params': linear_2.parameters(),
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
                    },
                ],
                weight_decay=0.1,
            )
            out = linear_1(a)
            out = linear_2(out)
            out.backward()
            adam.step()
            adam.clear_gradients()
        finally:
            # The sibling dygraph tests switch back to static mode when
            # done; do the same here so dygraph mode is not left enabled.
            paddle.enable_static()


Z
zhangbo9674 已提交
1077
class TestMultiTensorAdam(unittest.TestCase):
    """Compare Adam results with ``use_multi_tensor`` switched on and off."""

    def _adam_optimize_dygraph(
        self,
        place,
        use_param_attr=False,
        use_param_group=False,
        use_amp=False,
        use_multi_tensor=False,
    ):
        """Run two dygraph training steps of a ``Linear(5, 5)`` model.

        Args:
            place: Device string passed to ``paddle.set_device``.
            use_param_attr: If True, the layer is built with a ``ParamAttr``
                carrying a learning rate and an L2 regularizer.
            use_param_group: If True, the parameters are split into two
                groups with their own ``weight_decay``/``beta1``/``beta2``.
            use_amp: Enables ``multi_precision``; on ``'gpu'`` it also
                enables the O2 auto-cast training path.
            use_multi_tensor: Forwarded to ``paddle.optimizer.Adam``.

        Returns:
            Tuple of the last model output and ``model.parameters()``.
        """
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        # Named x so the builtin ``input`` is not shadowed.
        x = paddle.randn((5, 5))

        weight_attr = paddle.ParamAttr(
            learning_rate=0.5,
            regularizer=paddle.regularizer.L2Decay(1.0),
            trainable=True,
        )
        if use_param_attr:
            model = paddle.nn.Linear(5, 5, weight_attr)
        else:
            model = paddle.nn.Linear(5, 5)

        if not use_param_group:
            optimizer = paddle.optimizer.Adam(
                parameters=model.parameters(),
                use_multi_tensor=use_multi_tensor,
                multi_precision=use_amp,
            )
        else:
            parameters = list(model.parameters())
            half = len(parameters) // 2
            optimizer = paddle.optimizer.Adam(
                parameters=[
                    {
                        'params': parameters[:half],
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
                    },
                    {
                        'params': parameters[half:],
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
                    },
                ],
                use_multi_tensor=use_multi_tensor,
                multi_precision=use_amp,
            )

        run_amp = place == 'gpu' and bool(use_amp)
        for _ in range(2):
            if run_amp:
                # NOTE(review): the model is re-decorated and a fresh
                # GradScaler is created on every iteration, and
                # scaler.update() is never called. Presumably hoisting the
                # scaler would require adding update() - confirm before
                # changing this.
                model = paddle.amp.decorate(models=model, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                with paddle.amp.auto_cast(level='O2'):
                    output = model(x)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.step(optimizer)
                optimizer.clear_grad()
            else:
                output = model(x)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()

        return output, model.parameters()

    def _adam_optimize_static(
        self, place, use_amp=False, use_multi_tensor=False
    ):
        """Run five static-graph training steps of a single fc layer.

        Args:
            place: Device string passed to ``paddle.static.Executor``;
                ``use_amp`` is forced to False when it equals ``'cpu'``.
            use_amp: If True, train in pure fp16 via
                ``paddle.static.amp.decorate``.
            use_multi_tensor: Forwarded to ``paddle.optimizer.Adam``.

        Returns:
            List with the loss fetched at each of the five steps.
        """
        paddle.enable_static()
        paddle.seed(10)
        np.random.seed(10)
        if place == 'cpu':
            use_amp = False
        exe = paddle.static.Executor(place=place)
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.optimizer.Adam(
            multi_precision=use_amp, use_multi_tensor=use_multi_tensor
        )
        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )
        # The feed variable and the fed array share one dtype.
        data_dtype = 'float16' if use_amp else 'float32'
        with paddle.static.program_guard(train_program, startup_program):
            data = paddle.static.data(
                shape=[2, 2], name='X', dtype=data_dtype
            )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        exe.run(startup_program)
        if use_amp:
            optimizer.amp_init(place=place, scope=paddle.static.global_scope())
        x = np.random.random(size=(2, 2)).astype(data_dtype)
        out = []
        for _ in range(5):
            (loss_data,) = exe.run(
                train_program, feed={"X": x}, fetch_list=[loss.name]
            )
            out.append(loss_data)
        return out

    def _get_places(self):
        """Return ``['cpu']`` plus ``'gpu'`` when compiled with CUDA."""
        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        return places

    def _assert_dygraph_match(self, **kwargs):
        """Compare dygraph runs with ``use_multi_tensor`` True and False.

        ``kwargs`` are forwarded to ``_adam_optimize_dygraph``; the outputs
        and every parameter must agree within rtol 1e-05.
        """
        output1, params1 = self._adam_optimize_dygraph(
            use_multi_tensor=True, **kwargs
        )
        output2, params2 = self._adam_optimize_dygraph(
            use_multi_tensor=False, **kwargs
        )

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        # zip() would silently drop a trailing parameter, so check the
        # lengths explicitly first.
        self.assertEqual(len(params1), len(params2))
        for param1, param2 in zip(params1, params2):
            np.testing.assert_allclose(param1, param2, rtol=1e-05)

    def _check_with_place_amp(self, place, use_amp):
        """Compare multi-tensor on/off in both dygraph and static mode."""
        # test dygraph mode
        self._assert_dygraph_match(place=place, use_amp=use_amp)

        # test static mode
        output_static1 = self._adam_optimize_static(
            place=place, use_amp=use_amp, use_multi_tensor=True
        )
        output_static2 = self._adam_optimize_static(
            place=place, use_amp=use_amp, use_multi_tensor=False
        )
        self.assertEqual(len(output_static1), len(output_static2))
        for loss1, loss2 in zip(output_static1, output_static2):
            np.testing.assert_allclose(loss1, loss2, rtol=1e-05)

    def _check_with_param_arrt(self, place, use_amp):
        """Compare multi-tensor on/off for a layer built with a ParamAttr.

        The method name (sic) is kept unchanged for existing callers.
        """
        self._assert_dygraph_match(
            place=place, use_amp=use_amp, use_param_attr=True
        )

    def _check_with_param_group(self, place, use_amp):
        """Compare multi-tensor on/off when parameter groups are used."""
        self._assert_dygraph_match(
            place=place, use_amp=use_amp, use_param_group=True
        )

    def test_main(self):
        """Run every check for each available place, with and without AMP."""
        for place in self._get_places():
            for use_amp in (True, False):
                self._check_with_place_amp(place, use_amp)
                self._check_with_param_arrt(place, use_amp)
                self._check_with_param_group(place, use_amp)

    def test_api_eager_dygraph(self):
        """Re-run ``test_main`` inside ``_test_eager_guard``."""
        with _test_eager_guard():
            self.test_main()

Z
zhangbo9674 已提交
1279

1280
# Script entry point: switch Paddle to static-graph mode, then hand control
# to the unittest runner.
if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()