#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
from op_test import OpTest

import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.op import Operator


class TestAdamOp1(OpTest):
    def setUp(self):
        '''Test Adam Op with supplied attributes'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")

        learning_rate = 0.004
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
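        # Power accumulators as they would be after 10 Adam steps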
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
        }

        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, self.attrs)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOp2(OpTest):
    def set_shape(self):
        self.shape = (102, 105)

    def setUp(self):
        '''Test Adam Op with supplied attributes'''
        self.op_type = "adam"
        self.set_shape()
        param = np.random.uniform(-1, 1, self.shape).astype("float32")
        grad = np.random.uniform(-1, 1, self.shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, self.shape).astype("float32")
        # The second moment is positive
        moment2 = np.random.random(self.shape).astype("float32")

        learning_rate = 0.001
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
        }

        attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOnlyTailOp(TestAdamOp2):
    def set_shape(self):
        self.shape = 3


class TestAdamOpMultipleSteps(OpTest):
    def setUp(self):
        '''Test Adam Operator with supplied attributes'''
        self.op_type = "adam"
        self.num_steps = 10

        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")

        learning_rate = 0.001
        self.beta1 = 0.9
        self.beta2 = 0.999
        epsilon = 1e-8
        self.beta1_pow = self.beta1**10
        self.beta2_pow = self.beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([self.beta2_pow]).astype("float32"),
        }

        self.attrs = {
            'epsilon': epsilon,
            'beta1': self.beta1,
            'beta2': self.beta2,
        }

    def test_check_output(self):
        for _ in range(self.num_steps):
            param_out, moment1_out, moment2_out = adam_step(
                self.inputs, self.attrs
            )

            beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1
            beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2
            self.outputs = {
                'Moment1Out': moment1_out,
                'Moment2Out': moment2_out,
                'ParamOut': param_out,
                'Beta1PowOut': beta1_pow_out,
                'Beta2PowOut': beta2_pow_out,
            }

            # Verify output for this step
            self.check_output()

            # Output of this step becomes input for next step
            self.inputs['Param'] = param_out
            self.inputs['Moment1'] = moment1_out
            self.inputs['Moment2'] = moment2_out

            # Update powers of Beta1 and Beta2 for next time step
            self.inputs['Beta1Pow'] = beta1_pow_out
            self.inputs['Beta2Pow'] = beta2_pow_out

            # Randomize gradient for next step
            self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype(
                "float32"
            )

    def test_api_eager_dygraph(self):
        with _test_eager_guard():
            self.test_check_output()


def adam_step(inputs, attributes):
    '''
    Simulate one step of the adam optimizer
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output param, moment1 and moment2
    '''
    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']

    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0]

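    # Adam update: exponential moving averages of the gradient and its square,
    # then a bias-corrected learning rate and an epsilon-stabilized step.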
    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out


def adamw_step(inputs, attributes):
    '''
    Simulate one step of the AdamW optimizer (Adam with decoupled weight decay)
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output param, moment1 and moment2
    '''
    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']
    coeff = attributes["coeff"]
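    # AdamW decoupled weight decay: shrink the parameter by lr * coeff before
    # the regular Adam moment update (only when with_decay is set).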
    if attributes.get("with_decay", False):
        decay = 1.0 - lr * coeff
        param2 = param * decay
        param = param2.copy()
    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0]

    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))

    return param_out, moment1_out, moment2_out


def adam_step_sparse(
    inputs, attributes, height, rows, row_numel, np_grad, lazy_mode
):
    '''
    Simulate one step of the adam optimizer
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output param, moment1 and moment2
    '''
    param = inputs['Param']
    # grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    beta1 = attributes['beta1']
    beta2 = attributes['beta2']
    epsilon = attributes['epsilon']

    moment1_out = np.zeros(shape=[height, row_numel])
    moment2_out = np.zeros(shape=[height, row_numel])
    param_out = np.zeros(shape=[height, row_numel])

    def update_row(row_id, update_value):
        moment1_out[row_id] = (
            beta1 * moment1[row_id] + (1 - beta1) * update_value
        )
        moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square(
            update_value
        )
        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
        param_out[row_id] = param[row_id] - lr_t * (
            moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)
        )

    if lazy_mode:
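        # Lazy mode touches only the rows present in the sparse gradient;
        # the dense branch below updates every row, using zeros for absent rows.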
        for idx, row_id in enumerate(rows):
            update_row(row_id, np_grad[idx])
    else:
        for row_id in range(param_out.shape[0]):
            update_value = np.zeros(np_grad[0].shape).astype("float32")
            if row_id in rows:
                update_value = np_grad[rows.index(row_id)]
            update_row(row_id, update_value)

    return param_out, moment1_out, moment2_out


class TestSparseAdamOp(unittest.TestCase):
    def setup(self, scope, place, lazy_mode):
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        height = 10
        rows = [0, 4, 7]
        self.rows = rows
        row_numel = 12
        self.row_numel = row_numel
        self.dense_inputs = {
            "Param": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
            "LearningRate": np.full((1), 2.0).astype("float32"),
        }
        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
        self.attrs = {
            'epsilon': epsilon,
            'beta1': beta1,
            'beta2': beta2,
            'min_row_size_to_use_multithread': 2,
        }

        grad_selected_rows = scope.var('Grad').get_selected_rows()
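        # Only `rows` of the dense parameter receive a gradient; the values are
        # written into the underlying tensor below.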
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
        np_array = np.ones((len(rows), row_numel)).astype("float32")
        np_array[0, 0] = 2.0
        np_array[2, 8] = 4.0

        grad_tensor = grad_selected_rows.get_tensor()
        grad_tensor.set(np_array, place)

        self.sparse_inputs = ["Grad"]

        param_out, mom1, mom2 = adam_step_sparse(
            self.dense_inputs,
            self.attrs,
            height,
            rows,
            row_numel,
            np_array,
            lazy_mode,
        )
        self.outputs = {
            "ParamOut": param_out,
            "Moment1Out": mom1,
            "Moment2Out": mom2,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def check_with_place(self, place, lazy_mode):
        scope = core.Scope()
        self.setup(scope, place, lazy_mode)

        op_args = dict()
        op_args['lazy_mode'] = lazy_mode
        for key, np_array in self.dense_inputs.items():
            var = scope.var(key).get_tensor()
            var.set(np_array, place)
            op_args[key] = key
        for s in self.sparse_inputs:
            op_args[s] = s
        for s in self.outputs:
            var = scope.var(s).get_tensor()
            var.set(self.init_output, place)
            op_args[s] = s
        for k in self.attrs:
            op_args[k] = self.attrs[k]

        # create and run the adam operator
        adam_op = Operator("adam", **op_args)
        adam_op.run(scope, place)

        for key, np_array in self.outputs.items():
            out_var = scope.var(key).get_tensor()
            actual = np.array(out_var)
            actual = actual.reshape([actual.size])
            np_array = np_array.reshape([np_array.size])

            for i in range(np_array.size):
                self.assertLess(abs(actual[i] - np_array[i]), 0.00001)

    def test_sparse_adam(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            for lazy_mode in (True, False):
                self.check_with_place(place, lazy_mode)


class TestAdamOpBetaVariable(OpTest):
    def setUp(self):
        '''Test Adam Op with beta as Variable'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
        }

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpBetaEpsilonVariable(OpTest):
    def setUp(self):
        '''Test Adam Op with beta/epsilon as Variable'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
        }

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpWithGlobalBetaPow(OpTest):
    def setUp(self):
        '''Test Adam Op with global_beta_pow'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
        }

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.attrs = {'use_global_beta_pow': True}

        # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([]),
            'Beta2PowOut': np.array([]),
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpWithSkipUpdate(OpTest):
    def setUp(self):
        '''Test Adam Op with SkipUpdate'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
            "SkipUpdate": np.array([True]).astype("bool"),
        }

        attributes = {'epsilon': epsilon}

        self.attrs = {'use_global_beta_pow': True}

        # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
        self.outputs = {
            'Moment1Out': moment1,
            'Moment2Out': moment2,
            'ParamOut': param,
            'Beta1PowOut': self.inputs['Beta1Pow'],
            'Beta2PowOut': self.inputs['Beta2Pow'],
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpV2(unittest.TestCase):
    def test_adam_op(self):
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = paddle.static.nn.conv2d(data, 8, 3)
                loss = paddle.mean(conv)

                beta1 = paddle.static.create_global_var(
                    shape=[1], value=0.85, dtype='float32', persistable=True
                )
                beta2 = paddle.static.create_global_var(
                    shape=[1], value=0.95, dtype='float32', persistable=True
                )
                betas = [beta1, beta2]
                opt = paddle.optimizer.Adam(
                    learning_rate=1e-5,
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01,
                    epsilon=1e-8,
                )
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        assert rets[0] is not None

    def test_adam_op_dygraph(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = paddle.nn.Linear(13, 5)

        adam = paddle.optimizer.Adam(
            learning_rate=0.01, parameters=linear.parameters()
        )
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()
        paddle.enable_static()

    def test_adam_op_with_state_dict(self):

        paddle.disable_static()
        emb = paddle.nn.Embedding(10, 10)

        adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is LRScheduler
        learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
            learning_rate=0.1, T_max=10
        )
        adam = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            weight_decay=fluid.regularizer.L2Decay(0.001),
            parameters=emb.parameters(),
        )
        lr = adam.get_lr()
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is a Tensor
        with self.assertRaises(TypeError):
            learning_rate = np.array([0.01]).astype("float32")
            learning_rate = paddle.to_tensor(learning_rate)
            adam = paddle.optimizer.Adam(
                learning_rate=learning_rate, parameters=emb.parameters()
            )

        params = adam.get_opti_var_name_list()
        assert params is not None
        paddle.enable_static()

    def test_adam_with_grad_clip(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = paddle.nn.Linear(13, 5)
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
        adam = paddle.optimizer.Adam(
            0.1, parameters=linear.parameters(), grad_clip=clip
        )
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()
        paddle.enable_static()

    def test_adam_op_with_set_lr(self):
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

        lr = 0.01
        adam.set_lr(lr)
        cur_lr = adam.get_lr()
712
        assert lr == cur_lr
M
714
            lr_var = paddle.static.create_global_var(
715 716
                shape=[1], value=lr, dtype='float32'
            )
717
            adam.set_lr(lr_var)
718
        paddle.enable_static()
719

M
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        with self.assertRaises(ValueError):
724 725 726
            adam = paddle.optimizer.Adam(
                0.1, beta1=-1, parameters=linear.parameters()
            )
M
728 729 730
            adam = paddle.optimizer.Adam(
                0.1, beta2=-1, parameters=linear.parameters()
            )
M
732 733 734
            adam = paddle.optimizer.Adam(
                0.1, epsilon=-1, parameters=linear.parameters()
            )
735
        paddle.enable_static()
M
737 738 739 740 741 742
    def test_adam_op_with_sparse_input_and_weight_decay(self):

        paddle.disable_static()
        x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
        x = paddle.to_tensor(x_data, stop_gradient=False)
        emb = paddle.nn.Embedding(10, 10, sparse=True)
743 744 745
        adam = paddle.optimizer.Adam(
            0.001, parameters=emb.parameters(), weight_decay=0.01
        )
746 747 748 749 750

        with self.assertRaises(RuntimeError):
            out = emb(x)
            out.backward()
            adam.step()
751
        paddle.enable_static()
752

C
chentianyu03 已提交
753 754 755 756 757 758 759 760
    def test_api_eager_dygraph(self):
        with _test_eager_guard():
            self.test_adam_op_dygraph()
            self.test_adam_op_with_state_dict()
            self.test_adam_with_grad_clip()
            self.test_adam_op_with_set_lr()
            self.test_adam_op_with_sparse_input_and_weight_decay()

761

762
class TestAdamOptimizer(unittest.TestCase):
763 764 765 766 767 768 769 770
    def _test(
        self,
        place,
        use_tensor=True,
        use_fluid_api=True,
        use_global_beta_pow=False,
        flatten_param_grads=False,
    ):
771 772 773 774 775 776 777
        paddle.enable_static()
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        SEED = 2021
        paddle.seed(SEED)
        np.random.seed(SEED)

778 779 780 781 782 783
        a_np = np.random.random(size=(2, 2)).astype('float32')
        b_np = np.random.random(size=(2, 2)).astype('float32')
        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
        weight_attr1 = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
784 785
            trainable=True,
        )
786 787 788
        weight_attr2 = paddle.ParamAttr(
            name="weight2",
            initializer=fluid.initializer.Constant(value=2.0),
789 790
            trainable=True,
        )
791
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
792 793

        with paddle.static.program_guard(main_prog, startup_prog):
794 795 796
            with paddle.utils.unique_name.guard():
                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
797 798 799
                label = paddle.static.data(
                    name="label", shape=[2, 1], dtype='int64'
                )
800 801 802 803 804

                sum = paddle.add(a, b)
                z = paddle.pow(sum, 2.0)

                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
805 806 807
                prediction = fluid.layers.fc(
                    input=fc_1, size=2, param_attr=weight_attr2, act='softmax'
                )
808

809 810 811 812 813 814
                cost = paddle.nn.functional.cross_entropy(
                    input=prediction,
                    label=label,
                    reduction='none',
                    use_softmax=False,
                )
815
                loss = paddle.mean(cost)
816 817 818 819
                beta1_init = 0.9
                beta2_init = 0.999
                epsilon_init = 1e-8
                if use_tensor:
820
                    beta1 = paddle.static.create_global_var(
821 822 823 824
                        shape=[1],
                        value=float(beta1_init),
                        dtype='float32',
                        persistable=True,
825 826
                        name="beta1",
                    )
827
                    beta2 = paddle.static.create_global_var(
828 829 830 831
                        shape=[1],
                        value=float(beta2_init),
                        dtype='float32',
                        persistable=True,
832 833
                        name="beta2",
                    )
834
                    epsilon = paddle.static.create_global_var(
835 836 837 838
                        shape=[1],
                        value=float(epsilon_init),
                        dtype='float32',
                        persistable=True,
839 840
                        name="epsilon",
                    )
841 842 843 844 845 846 847 848 849
                    if use_fluid_api:
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1,
                            beta2=beta2,
                            epsilon=epsilon,
                            use_global_beta_pow=use_global_beta_pow,
                            flatten_param_grads=flatten_param_grads,
                            align_size=256,
850 851
                            grad_clip=clip,
                        )
852
                    else:
853 854 855 856 857 858 859
                        adam = paddle.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1,
                            beta2=beta2,
                            epsilon=epsilon,
                            grad_clip=clip,
                        )
860
                else:
861 862 863 864 865 866 867 868 869
                    if use_fluid_api:
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1_init,
                            beta2=beta2_init,
                            epsilon=epsilon_init,
                            use_global_beta_pow=use_global_beta_pow,
                            flatten_param_grads=flatten_param_grads,
                            align_size=256,
870 871
                            grad_clip=clip,
                        )
872
                    else:
873 874 875 876 877 878 879
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1_init,
                            beta2=beta2_init,
                            epsilon=epsilon_init,
                            grad_clip=clip,
                        )
880 881 882 883 884 885 886 887 888 889

                adam.minimize(loss)

        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe = paddle.static.Executor(place)
            exe.run(startup_prog)

            print("Start run on {}".format(place))
            for epoch in range(10):
890 891 892 893 894 895 896 897 898 899
                pred_res, loss_res = exe.run(
                    main_prog,
                    feed={"a": a_np, "b": b_np, "label": label_np},
                    fetch_list=[prediction, loss],
                )
                print(
                    "Epoch {} | Prediction[0]: {}, Loss: {}".format(
                        epoch, pred_res[0], loss_res
                    )
                )
900 901
            paddle.disable_static()
            return pred_res, loss_res
902 903 904 905 906 907 908

    def _test_with_place(self, place):
        preds = []
        losses = []

        for use_tensor in [True, False]:
            for use_fluid_api in [True, False]:
909
                for use_global_beta_pow in [True, False]:
910
                    for flatten_param_grads in [True, False]:
911 912 913 914 915 916 917
                        pred, loss = self._test(
                            place,
                            use_tensor,
                            use_fluid_api,
                            use_global_beta_pow,
                            flatten_param_grads,
                        )
918 919
                        preds.append(pred)
                        losses.append(loss)
920
        for pred in preds:
921
            np.testing.assert_allclose(pred, preds[0], rtol=1e-05)
922
        for loss in losses:
923
            np.testing.assert_allclose(loss, losses[0], rtol=1e-05)
924 925 926 927 928 929 930

    def test_adam_api(self):
        # NOTE(zhiqiu): cpu and gpu have different seeds, so compare them separately.
        self._test_with_place(paddle.CPUPlace())
        if core.is_compiled_with_cuda():
            self._test_with_place(paddle.CUDAPlace(0))

931 932 933 934 935 936 937 938
    def test_adam_flatten_param_grads_with_regularizer(self):
        # flatten_param_grads + regularizer is not supported yet.
        paddle.enable_static()
        main = fluid.Program()
        weight_attr = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            regularizer=fluid.regularizer.L1DecayRegularizer(
939 940 941 942
                regularization_coeff=0.1
            ),
            trainable=True,
        )
943 944 945
        with fluid.program_guard(main):
            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
946 947 948
            y_predict = fluid.layers.fc(
                input=x, size=1, act=None, param_attr=weight_attr
            )
949 950 951
            cost = paddle.nn.functional.square_error_cost(
                input=y_predict, label=y
            )
952
            avg_cost = paddle.mean(cost)
953

954 955 956
            adam = fluid.optimizer.AdamOptimizer(
                0.01, flatten_param_grads=True, align_size=256
            )
957 958 959 960 961
            adam.minimize(avg_cost)
            paddle.disable_static()

            self.assertEqual(adam._flatten_param_grads, False)

962 963 964 965 966 967 968 969 970 971 972 973
    def test_adam_exception(self):
        paddle.enable_static()
        a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
        b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
        label = paddle.static.data(name="label", shape=[32, 1], dtype='int64')

        sum = paddle.add(a, b)
        z = paddle.pow(sum, 2.0)

        fc_1 = fluid.layers.fc(input=z, size=128)
        prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')

974 975 976
        cost = paddle.nn.functional.cross_entropy(
            input=prediction, label=label, reduction='none', use_softmax=False
        )
977
        loss = paddle.mean(cost)
978 979 980
        adam = fluid.optimizer.Adam(use_global_beta_pow=True)
        adam.minimize(loss)
        self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
981 982 983
        adam._add_global_accumulator(
            'tmp', type=core.VarDesc.VarType.LOD_TENSOR
        )
984
        adam._get_global_accumulator('tmp')
985 986 987 988 989 990
        self.assertRaises(
            Exception,
            adam._add_global_accumulator,
            adam._beta1_pow_acc_str,
            type=core.VarDesc.VarType.LOD_TENSOR,
        )
991 992 993 994 995 996 997 998 999 1000
        paddle.disable_static()

    def test_adam_save_load(self):
        paddle.disable_static()
        a = paddle.rand([4, 10])
        linear = paddle.nn.Linear(10, 10)
        b = linear(a)
        state_dict = linear.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")

1001 1002 1003 1004 1005 1006 1007 1008
        scheduler = paddle.optimizer.lr.NoamDecay(
            d_model=0.01, warmup_steps=100, verbose=True
        )
        adam = paddle.fluid.optimizer.Adam(
            learning_rate=scheduler,
            parameter_list=linear.parameters(),
            use_global_beta_pow=True,
        )
1009 1010 1011
        adam.minimize(b)
        state_dict = adam.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")
1012 1013
        para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy")
        adam.set_state_dict(opt_state_dict)
1014 1015 1016

        paddle.enable_static()

1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
    def test_adam_save_load_error(self):
        paddle.disable_static()

        def get_opt(dtype, shape):
            with paddle.utils.unique_name.guard():
                paddle.set_default_dtype(dtype)
                a = paddle.rand([4, 10])
                linear = paddle.nn.Linear(10, 10)
                b = linear(a)
                state_dict = linear.state_dict()
                fluid.save_dygraph(state_dict, "paddle_dy")

1029 1030 1031
                scheduler = paddle.optimizer.lr.NoamDecay(
                    d_model=0.01, warmup_steps=100, verbose=True
                )
1032 1033 1034
                adam = paddle.fluid.optimizer.Adam(
                    learning_rate=scheduler,
                    parameter_list=linear.parameters(),
1035 1036
                    use_global_beta_pow=True,
                )
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
                adam.minimize(b)
                return adam

        adam = get_opt('float32', [10, 10])

        state_dict = adam.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")
        para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy")
        adam.set_state_dict(opt_state_dict)

        adam2 = get_opt('float64', [10, 10])  # dtype not match
        self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)

        adam3 = get_opt('float32', [10, 10])  # shape not match
1051 1052 1053
        opt_state_dict['beta1_pow_acc_0'] = np.array(
            [0.9, 0.9], dtype='float32'
        )
1054 1055 1056
        self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
        paddle.enable_static()

1057

1058 1059 1060 1061 1062 1063 1064 1065
class TestAdamOpV2Group(TestAdamOpV2):
    def test_adam_op(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_tensor(value)
        linear_1 = paddle.nn.Linear(13, 5)
        linear_2 = paddle.nn.Linear(5, 3)
        # This can be any optimizer supported by dygraph.
1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078
        adam = paddle.optimizer.Adam(
            learning_rate=0.01,
            parameters=[
                {'params': linear_1.parameters()},
                {
                    'params': linear_2.parameters(),
                    'weight_decay': 0.001,
                    'beta1': 0.1,
                    'beta2': 0.99,
                },
            ],
            weight_decay=0.1,
        )
1079 1080 1081 1082 1083 1084 1085
        out = linear_1(a)
        out = linear_2(out)
        out.backward()
        adam.step()
        adam.clear_gradients()


Z
zhangbo9674 已提交
1086
class TestMultiTensorAdam(unittest.TestCase):
1087 1088 1089 1090 1091 1092 1093 1094
    def _adam_optimize_dygraph(
        self,
        place,
        use_param_attr=False,
        use_param_group=False,
        use_amp=False,
        use_multi_tensor=False,
    ):
Z
zhangbo9674 已提交
1095 1096 1097 1098 1099 1100 1101 1102 1103
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        input = paddle.randn((5, 5))

        weight_attr = paddle.ParamAttr(
            learning_rate=0.5,
            regularizer=paddle.regularizer.L2Decay(1.0),
1104 1105
            trainable=True,
        )
Z
zhangbo9674 已提交
1106
        if use_param_attr:
1107
            model = paddle.nn.Linear(5, 5, weight_attr=weight_attr)
Z
zhangbo9674 已提交
1108 1109 1110 1111
        else:
            model = paddle.nn.Linear(5, 5)

        if not use_param_group:
1112 1113 1114 1115 1116
            optimizer = paddle.optimizer.Adam(
                parameters=model.parameters(),
                use_multi_tensor=use_multi_tensor,
                multi_precision=use_amp,
            )
Z
zhangbo9674 已提交
1117
        else:
1118 1119
            parameters = list(model.parameters())
            param_num = len(parameters)
1120 1121 1122
            optimizer = paddle.optimizer.Adam(
                parameters=[
                    {
1123
                        'params': parameters[: int(param_num / 2)],
1124 1125 1126
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
1127 1128 1129 1130 1131 1132 1133
                    },
                    {
                        'params': parameters[int(param_num / 2) :],
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
                    },
1134 1135 1136 1137
                ],
                use_multi_tensor=use_multi_tensor,
                multi_precision=use_amp,
            )
Z
zhangbo9674 已提交
1138 1139

        for idx in range(2):
1140
            if place == 'gpu' and use_amp:
Z
zhangbo9674 已提交
1141 1142 1143
                model = paddle.amp.decorate(models=model, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

1144
            if place == 'gpu' and use_amp:
Z
zhangbo9674 已提交
1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.step(optimizer)
                optimizer.clear_grad()
            else:
                output = model(input)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()

        return output, model.parameters()

    def _adam_optimize_static(
        self, place, use_amp=False, use_multi_tensor=False
    ):
        paddle.enable_static()
        paddle.seed(10)
        np.random.seed(10)
        if place == 'cpu':
            use_amp = False
        exe = paddle.static.Executor(place=place)
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.optimizer.Adam(
            multi_precision=use_amp, use_multi_tensor=use_multi_tensor
        )
        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )
        with paddle.static.program_guard(train_program, startup_program):
            if use_amp:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float16'
                )
            else:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float32'
                )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        exe.run(startup_program)
        if use_amp:
            optimizer.amp_init(place=place, scope=paddle.static.global_scope())
            x = np.random.random(size=(2, 2)).astype('float16')
        else:
            x = np.random.random(size=(2, 2)).astype('float32')
        out = []
        for idx in range(5):
1203 1204 1205
            (loss_data,) = exe.run(
                train_program, feed={"X": x}, fetch_list=[loss.name]
            )
Z
zhangbo9674 已提交
1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
            out.append(loss_data)
        return out

    def _get_places(self):
        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        return places

    def _check_with_place_amp(self, place, use_amp):
        # test dygraph mode
        output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph(
            place=place, use_amp=use_amp, use_multi_tensor=True
        )
        output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph(
            place=place, use_amp=use_amp, use_multi_tensor=False
        )
        np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05)
        for idx in range(len(params_dygraph1)):
            np.testing.assert_allclose(
                params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05
            )
        # test static mode
        output_static1 = self._adam_optimize_static(
            place=place, use_amp=use_amp, use_multi_tensor=True
        )
        output_static2 = self._adam_optimize_static(
            place=place, use_amp=use_amp, use_multi_tensor=False
        )
        for idx in range(len(output_static1)):
            np.testing.assert_allclose(
                output_static1[idx], output_static2[idx], rtol=1e-05
            )

    def _check_with_param_attr(self, place, use_amp):
        output1, params1 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_attr=True,
            use_multi_tensor=True,
        )
        output2, params2 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_attr=True,
            use_multi_tensor=False,
        )

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        for idx in range(len(params1)):
            np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)

    def _check_with_param_group(self, place, use_amp):
        output1, params1 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_group=True,
            use_multi_tensor=True,
        )
        output2, params2 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_group=True,
            use_multi_tensor=False,
        )

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        for idx in range(len(params1)):
            np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)

    def test_main(self):
        for place in self._get_places():
            use_amp_list = [True, False]
            for use_amp in use_amp_list:
                self._check_with_place_amp(place, use_amp)
                self._check_with_param_attr(place, use_amp)
                self._check_with_param_group(place, use_amp)

    def test_api_eager_dygraph(self):
        with _test_eager_guard():
            self.test_main()


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()