#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
import paddle
from paddle.fluid.framework import _test_eager_guard


class TestAdamOp1(OpTest):

    def setUp(self):
        '''Test Adam Op with supplied attributes
        '''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")

        learning_rate = 0.004
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
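        # Power accumulators as if 10 steps have already been taken.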
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32")
        }

        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        param_out, moment1_out, \
            moment2_out = adam_step(self.inputs, self.attrs)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOp2(OpTest):

    def set_shape(self):
        self.shape = (102, 105)

    def setUp(self):
        '''Test Adam Op with supplied attributes
        '''
        self.op_type = "adam"
        self.set_shape()
        param = np.random.uniform(-1, 1, self.shape).astype("float32")
        grad = np.random.uniform(-1, 1, self.shape).astype("float32")
        moment1 = np.random.uniform(-1, 1, self.shape).astype("float32")
        # The second moment is positive
        moment2 = np.random.random(self.shape).astype("float32")

        learning_rate = 0.001
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32")
        }

        attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        param_out, moment1_out, \
            moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOnlyTailOp(TestAdamOp2):

    def set_shape(self):
        self.shape = (3)


class TestAdamOpMultipleSteps(OpTest):

    def setUp(self):
        '''Test Adam Operator with supplied attributes
        '''
        self.op_type = "adam"
        self.num_steps = 10

        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")

        learning_rate = 0.001
        self.beta1 = 0.9
        self.beta2 = 0.999
        epsilon = 1e-8
        self.beta1_pow = self.beta1**10
        self.beta2_pow = self.beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([self.beta2_pow]).astype("float32")
        }

        self.attrs = {
            'epsilon': epsilon,
            'beta1': self.beta1,
            'beta2': self.beta2
        }

    def test_check_output(self):
        for _ in range(self.num_steps):
            param_out, moment1_out, \
                moment2_out = adam_step(self.inputs, self.attrs)

            beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1
            beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2
            self.outputs = {
                'Moment1Out': moment1_out,
                'Moment2Out': moment2_out,
                'ParamOut': param_out,
                'Beta1PowOut': beta1_pow_out,
                'Beta2PowOut': beta2_pow_out
            }

            # Verify output for this step
            self.check_output()

            # Output of this step becomes input for next step
            self.inputs['Param'] = param_out
            self.inputs['Moment1'] = moment1_out
            self.inputs['Moment2'] = moment2_out

            # Update powers of Beta1 and Beta2 for next time step
            self.inputs['Beta1Pow'] = beta1_pow_out
            self.inputs['Beta2Pow'] = beta2_pow_out

            # Randomize gradient for next step
            self.inputs['Grad'] = np.random.uniform(
                -1, 1, (102, 105)).astype("float32")

    def test_api_eager_dygraph(self):
        with _test_eager_guard():
            self.test_check_output()


def adam_step(inputs, attributes):
    '''
    Simulate one step of the adam optimizer
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output param, moment1 and moment2
    '''
    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']

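    # beta1/beta2 come either from the op attributes or from the
    # Beta1Tensor/Beta2Tensor inputs.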
    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0]

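    # Reference Adam update, mirroring the operator:
    #   m_t = beta1 * m_{t-1} + (1 - beta1) * g
    #   v_t = beta2 * v_{t-1} + (1 - beta2) * g^2
    #   lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
    #   param_t = param_{t-1} - lr_t * m_t / (sqrt(v_t) + epsilon)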
    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out


def adamw_step(inputs, attributes):
    '''
    Simulate one step of the adamw optimizer
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output param, moment1 and moment2
    '''
    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']
    coeff = attributes["coeff"]
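    # AdamW decouples weight decay from the gradient update: the parameter is
    # first scaled by (1 - lr * coeff), then the regular Adam step is applied.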
    if attributes.get("with_decay", False):
        decay = 1.0 - lr * coeff
        param2 = param * decay
        param = param2.copy()
    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0]

    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))

    return param_out, moment1_out, moment2_out


def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
                     lazy_mode):
    '''
    Simulate one step of the adam optimizer on a sparse (SelectedRows) gradient
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output param, moment1 and moment2
    '''
    param = inputs['Param']
    # grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    beta1 = attributes['beta1']
    beta2 = attributes['beta2']
    epsilon = attributes['epsilon']

    moment1_out = np.zeros(shape=[height, row_numel])
    moment2_out = np.zeros(shape=[height, row_numel])
    param_out = np.zeros(shape=[height, row_numel])

    def update_row(row_id, update_value):
        moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
                                                         beta1) * update_value
        moment2_out[row_id] = beta2 * moment2[row_id] + (
            1 - beta2) * np.square(update_value)
        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
        param_out[row_id] = param[row_id] - lr_t * (
            moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon))

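    # In lazy mode only the rows present in the sparse gradient are updated;
    # otherwise every row is updated, using a zero gradient for missing rows.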
    if lazy_mode:
        for idx, row_id in enumerate(rows):
            update_row(row_id, np_grad[idx])
    else:
        for row_id in range(param_out.shape[0]):
            update_value = np.zeros(np_grad[0].shape).astype("float32")
            if row_id in rows:
                update_value = np_grad[rows.index(row_id)]
            update_row(row_id, update_value)

    return param_out, moment1_out, moment2_out


class TestSparseAdamOp(unittest.TestCase):

    def setup(self, scope, place, lazy_mode):
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
        beta1_pow = np.array([beta1**10]).astype("float32")
        beta2_pow = np.array([beta2**10]).astype("float32")

        height = 10
        rows = [0, 4, 7]
        self.rows = rows
        row_numel = 12
        self.row_numel = row_numel
        self.dense_inputs = {
            "Param": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
            "LearningRate": np.full((1), 2.0).astype("float32")
        }
        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
        self.attrs = {
            'epsilon': epsilon,
            'beta1': beta1,
            'beta2': beta2,
            'min_row_size_to_use_multithread': 2
        }

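        # The gradient is a SelectedRows variable that only carries values for
        # the rows listed in `rows`.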
        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
        np_array = np.ones((len(rows), row_numel)).astype("float32")
        np_array[0, 0] = 2.0
        np_array[2, 8] = 4.0

        grad_tensor = grad_selected_rows.get_tensor()
        grad_tensor.set(np_array, place)

        self.sparse_inputs = ["Grad"]

        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
                                                 height, rows, row_numel,
                                                 np_array, lazy_mode)
        self.outputs = {
            "ParamOut": param_out,
            "Moment1Out": mom1,
            "Moment2Out": mom2,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2
        }

    def check_with_place(self, place, lazy_mode):
        scope = core.Scope()
        self.setup(scope, place, lazy_mode)

        op_args = dict()
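        # Set every dense input tensor in the scope and pass inputs/outputs to
        # the operator by variable name.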
        op_args['lazy_mode'] = lazy_mode
        for key, np_array in self.dense_inputs.items():
            var = scope.var(key).get_tensor()
            var.set(np_array, place)
            op_args[key] = key
        for s in self.sparse_inputs:
            op_args[s] = s
        for s in self.outputs:
            var = scope.var(s).get_tensor()
            var.set(self.init_output, place)
            op_args[s] = s
        for k in self.attrs:
            op_args[k] = self.attrs[k]

        # create and run adam operator
        adam_op = Operator("adam", **op_args)
        adam_op.run(scope, place)

        for key, np_array in self.outputs.items():
            out_var = scope.var(key).get_tensor()
            actual = np.array(out_var)
            actual = actual.reshape([actual.size])
            np_array = np_array.reshape([np_array.size])

            for i in range(np_array.size):
                self.assertLess((actual[i] - np_array[i]), 0.00001)

    def test_sparse_adam(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            for lazy_mode in (True, False):
                self.check_with_place(place, lazy_mode)


class TestAdamOpBetaVariable(OpTest):

    def setUp(self):
        '''Test Adam Op with beta as Variable
        '''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
        }

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, \
            moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpBetaEpsilonVariable(OpTest):

    def setUp(self):
        '''Test Adam Op with beta/epsilon as Variable
        '''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
        }

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, \
            moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpWithGlobalBetaPow(OpTest):

    def setUp(self):
        '''Test Adam Op with global_beta_pow
        '''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
        }

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, \
            moment2_out = adam_step(self.inputs, attributes)

        self.attrs = {'use_global_beta_pow': True}

        # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([]),
            'Beta2PowOut': np.array([])
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpWithSkipUpdate(OpTest):

    def setUp(self):
        '''Test Adam Op with SkipUpdate
        '''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        # The second moment is positive
        moment2 = np.random.random((102, 105)).astype("float32")
        beta1 = 0.85
        beta2 = 0.95

        learning_rate = 0.001
        epsilon = 1e-8
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            "Beta1Tensor": np.array([beta1]).astype("float32"),
            "Beta2Tensor": np.array([beta2]).astype("float32"),
            "EpsilonTensor": np.array([epsilon]).astype("float32"),
            "SkipUpdate": np.array([True]).astype("bool"),
        }

        attributes = {'epsilon': epsilon}

        self.attrs = {'use_global_beta_pow': True}

        # SkipUpdate is set, so param, moments and beta pow accumulators pass through unchanged.
        self.outputs = {
            'Moment1Out': moment1,
            'Moment2Out': moment2,
            'ParamOut': param,
            'Beta1PowOut': self.inputs['Beta1Pow'],
            'Beta2PowOut': self.inputs['Beta2Pow'],
        }

    def test_check_output(self):
        self.check_output()


class TestAdamOpV2(unittest.TestCase):

    def test_adam_op(self):
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = fluid.layers.conv2d(data, 8, 3)
                loss = fluid.layers.reduce_mean(conv)

                beta1 = fluid.layers.create_global_var(shape=[1],
                                                       value=0.85,
                                                       dtype='float32',
                                                       persistable=True)
                beta2 = fluid.layers.create_global_var(shape=[1],
                                                       value=0.95,
                                                       dtype='float32',
                                                       persistable=True)
                betas = [beta1, beta2]
                opt = paddle.optimizer.Adam(learning_rate=1e-5,
                                            beta1=beta1,
                                            beta2=beta2,
                                            weight_decay=0.01,
                                            epsilon=1e-8)
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        assert rets[0] is not None

    def test_adam_op_dygraph(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = fluid.Linear(13, 5, dtype="float32")

        adam = paddle.optimizer.Adam(learning_rate=0.01,
                                     parameters=linear.parameters())
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()
        paddle.enable_static()

    def test_adam_op_with_state_dict(self):

        paddle.disable_static()
        emb = paddle.nn.Embedding(10, 10)

        adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is LRScheduler
        learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
            learning_rate=0.1, T_max=10)
        adam = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            weight_decay=fluid.regularizer.L2Decay(0.001),
            parameters=emb.parameters())
        lr = adam.get_lr()
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is Tensor
        with self.assertRaises(TypeError):
            learning_rate = np.array([0.01]).astype("float32")
            learning_rate = paddle.to_tensor(learning_rate)
            adam = paddle.optimizer.Adam(learning_rate=learning_rate,
                                         parameters=emb.parameters())

        params = adam.get_opti_var_name_list()
        assert (params is not None)
        paddle.enable_static()

    def test_adam_with_grad_clip(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = fluid.Linear(13, 5, dtype="float32")
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
        adam = paddle.optimizer.Adam(0.1,
                                     parameters=linear.parameters(),
                                     grad_clip=clip)
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()
        paddle.enable_static()

    def test_adam_op_with_set_lr(self):
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

        lr = 0.01
        adam.set_lr(lr)
        cur_lr = adam.get_lr()
        assert (lr == cur_lr)
        with self.assertRaises(TypeError):
            lr_var = paddle.fluid.layers.create_global_var(shape=[1],
                                                           value=lr,
                                                           dtype='float32')
            adam.set_lr(lr_var)
        paddle.enable_static()

    def test_adam_op_invalid_input(self):
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        with self.assertRaises(ValueError):
            adam = paddle.optimizer.Adam(0.1,
                                         beta1=-1,
                                         parameters=linear.parameters())
        with self.assertRaises(ValueError):
            adam = paddle.optimizer.Adam(0.1,
                                         beta2=-1,
                                         parameters=linear.parameters())
        with self.assertRaises(ValueError):
            adam = paddle.optimizer.Adam(0.1,
                                         epsilon=-1,
                                         parameters=linear.parameters())
        paddle.enable_static()

    def test_adam_op_with_sparse_input_and_weight_decay(self):

        paddle.disable_static()
        x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
        x = paddle.to_tensor(x_data, stop_gradient=False)
        emb = paddle.nn.Embedding(10, 10, sparse=True)
        adam = paddle.optimizer.Adam(0.001,
                                     parameters=emb.parameters(),
                                     weight_decay=0.01)

        with self.assertRaises(RuntimeError):
            out = emb(x)
            out.backward()
            adam.step()
757
        paddle.enable_static()
758

C
chentianyu03 已提交
759 760 761 762 763 764 765 766
    def test_api_eager_dygraph(self):
        with _test_eager_guard():
            self.test_adam_op_dygraph()
            self.test_adam_op_with_state_dict()
            self.test_adam_with_grad_clip()
            self.test_adam_op_with_set_lr()
            self.test_adam_op_with_sparse_input_and_weight_decay()


class TestAdamOptimizer(unittest.TestCase):

    def _test(self,
              place,
              use_tensor=True,
              use_fluid_api=True,
774 775
              use_global_beta_pow=False,
              flatten_param_grads=False):
776 777 778 779 780 781 782
        paddle.enable_static()
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        SEED = 2021
        paddle.seed(SEED)
        np.random.seed(SEED)

        a_np = np.random.random(size=(2, 2)).astype('float32')
        b_np = np.random.random(size=(2, 2)).astype('float32')
        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
        weight_attr1 = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            trainable=True)
        weight_attr2 = paddle.ParamAttr(
            name="weight2",
            initializer=fluid.initializer.Constant(value=2.0),
            trainable=True)
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
795 796

        with paddle.static.program_guard(main_prog, startup_prog):
797 798 799
            with paddle.utils.unique_name.guard():
                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
800 801 802
                label = paddle.static.data(name="label",
                                           shape=[2, 1],
                                           dtype='int64')
803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847

                sum = paddle.add(a, b)
                z = paddle.pow(sum, 2.0)

                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
                prediction = fluid.layers.fc(input=fc_1,
                                             size=2,
                                             param_attr=weight_attr2,
                                             act='softmax')

                cost = fluid.layers.cross_entropy(input=prediction, label=label)
                loss = fluid.layers.reduce_mean(cost)
                beta1_init = 0.9
                beta2_init = 0.999
                epsilon_init = 1e-8
                if use_tensor:
                    beta1 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta1_init),
                        dtype='float32',
                        persistable=True,
                        name="beta1")
                    beta2 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta2_init),
                        dtype='float32',
                        persistable=True,
                        name="beta2")
                    epsilon = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(epsilon_init),
                        dtype='float32',
                        persistable=True,
                        name="epsilon")
                    if use_fluid_api:
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1,
                            beta2=beta2,
                            epsilon=epsilon,
                            use_global_beta_pow=use_global_beta_pow,
                            flatten_param_grads=flatten_param_grads,
                            align_size=256,
                            grad_clip=clip)
                    else:
                        adam = paddle.optimizer.Adam(learning_rate=0.01,
                                                     beta1=beta1,
                                                     beta2=beta2,
                                                     epsilon=epsilon,
                                                     grad_clip=clip)
                else:
                    if use_fluid_api:
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1_init,
                            beta2=beta2_init,
                            epsilon=epsilon_init,
                            use_global_beta_pow=use_global_beta_pow,
                            flatten_param_grads=flatten_param_grads,
                            align_size=256,
                            grad_clip=clip)
                    else:
                        adam = fluid.optimizer.Adam(learning_rate=0.01,
                                                    beta1=beta1_init,
                                                    beta2=beta2_init,
                                                    epsilon=epsilon_init,
                                                    grad_clip=clip)

                adam.minimize(loss)

        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe = paddle.static.Executor(place)
            exe.run(startup_prog)

            print("Start run on {}".format(place))
            for epoch in range(10):
                pred_res, loss_res = exe.run(main_prog,
                                             feed={
                                                 "a": a_np,
                                                 "b": b_np,
                                                 "label": label_np
                                             },
                                             fetch_list=[prediction, loss])
                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                    epoch, pred_res[0], loss_res))
            paddle.disable_static()
            return pred_res, loss_res

    def _test_with_place(self, place):
        preds = []
        losses = []

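        # Every option combination below should produce identical predictions and losses.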
        for use_tensor in [True, False]:
            for use_fluid_api in [True, False]:
                for use_global_beta_pow in [True, False]:
                    for flatten_param_grads in [True, False]:
                        pred, loss = self._test(place, use_tensor,
                                                use_fluid_api,
                                                use_global_beta_pow,
                                                flatten_param_grads)
                        preds.append(pred)
                        losses.append(loss)
        for pred in preds:
            np.testing.assert_allclose(pred, preds[0], rtol=1e-05)
        for loss in losses:
            np.testing.assert_allclose(loss, losses[0], rtol=1e-05)

    def test_adam_api(self):
        # NOTE(zhiqiu): cpu and gpu have different seeds, so they should be compared separately.
        self._test_with_place(paddle.CPUPlace())
        if core.is_compiled_with_cuda():
            self._test_with_place(paddle.CUDAPlace(0))

    def test_adam_flatten_param_grads_with_regularizer(self):
        # flatten_param_grads + regularizer is not supported yet.
        paddle.enable_static()
        main = fluid.Program()
        weight_attr = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            regularizer=fluid.regularizer.L1DecayRegularizer(
                regularization_coeff=0.1),
            trainable=True)
        with fluid.program_guard(main):
            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
            y_predict = fluid.layers.fc(input=x,
                                        size=1,
                                        act=None,
                                        param_attr=weight_attr)
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_cost = paddle.mean(cost)

            adam = fluid.optimizer.AdamOptimizer(0.01,
                                                 flatten_param_grads=True,
                                                 align_size=256)
            adam.minimize(avg_cost)
            paddle.disable_static()

            self.assertEqual(adam._flatten_param_grads, False)

    def test_adam_exception(self):
        paddle.enable_static()
        a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
        b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
        label = paddle.static.data(name="label", shape=[32, 1], dtype='int64')

        sum = paddle.add(a, b)
        z = paddle.pow(sum, 2.0)

        fc_1 = fluid.layers.fc(input=z, size=128)
        prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')

        cost = fluid.layers.cross_entropy(input=prediction, label=label)
        loss = fluid.layers.reduce_mean(cost)
        adam = fluid.optimizer.Adam(use_global_beta_pow=True)
        adam.minimize(loss)
        self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
        adam._add_global_accumulator('tmp',
                                     type=core.VarDesc.VarType.LOD_TENSOR)
        adam._get_global_accumulator('tmp')
        self.assertRaises(Exception,
                          adam._add_global_accumulator,
                          adam._beta1_pow_acc_str,
                          type=core.VarDesc.VarType.LOD_TENSOR)
        paddle.disable_static()

    def test_adam_save_load(self):
        paddle.disable_static()
        a = paddle.rand([4, 10])
        linear = paddle.nn.Linear(10, 10)
        b = linear(a)
        state_dict = linear.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")

        scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
                                                  warmup_steps=100,
                                                  verbose=True)
        adam = paddle.fluid.optimizer.Adam(learning_rate=scheduler,
                                           parameter_list=linear.parameters(),
                                           use_global_beta_pow=True)
        adam.minimize(b)
        state_dict = adam.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")
        para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy")
        adam.set_state_dict(opt_state_dict)

        paddle.enable_static()

    def test_adam_save_load_error(self):
        paddle.disable_static()

        def get_opt(dtype, shape):
            with paddle.utils.unique_name.guard():
                paddle.set_default_dtype(dtype)
                a = paddle.rand([4, 10])
                linear = paddle.nn.Linear(10, 10)
                b = linear(a)
                state_dict = linear.state_dict()
                fluid.save_dygraph(state_dict, "paddle_dy")

                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
                                                          warmup_steps=100,
                                                          verbose=True)
                adam = paddle.fluid.optimizer.Adam(
                    learning_rate=scheduler,
                    parameter_list=linear.parameters(),
                    use_global_beta_pow=True)
                adam.minimize(b)
                return adam

        adam = get_opt('float32', [10, 10])

        state_dict = adam.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")
        para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy")
        adam.set_state_dict(opt_state_dict)

        adam2 = get_opt('float64', [10, 10])  # dtype not match
        self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)

        adam3 = get_opt('float32', [10, 10])  # shape not match
        opt_state_dict['beta1_pow_acc_0'] = np.array([0.9, 0.9],
                                                     dtype='float32')
        self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
        paddle.enable_static()


class TestAdamOpV2Group(TestAdamOpV2):

    def test_adam_op(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_tensor(value)
        linear_1 = paddle.nn.Linear(13, 5)
        linear_2 = paddle.nn.Linear(5, 3)
        # This can be any optimizer supported by dygraph.
        adam = paddle.optimizer.Adam(learning_rate=0.01,
                                     parameters=[{
                                         'params': linear_1.parameters()
                                     }, {
                                         'params': linear_2.parameters(),
                                         'weight_decay': 0.001,
                                         'beta1': 0.1,
                                         'beta2': 0.99
                                     }],
                                     weight_decay=0.1)
        out = linear_1(a)
        out = linear_2(out)
        out.backward()
        adam.step()
        adam.clear_gradients()


class TestMultiTensorAdam(unittest.TestCase):

    def _adam_optimize_dygraph(self,
                               place,
                               use_param_attr=False,
                               use_param_group=False,
                               use_amp=False,
                               use_multi_tensor=False):
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        input = paddle.randn((5, 5))

        weight_attr = paddle.ParamAttr(
            learning_rate=0.5,
            regularizer=paddle.regularizer.L2Decay(1.0),
            trainable=True)
        if use_param_attr:
            model = paddle.nn.Linear(5, 5, weight_attr)
        else:
            model = paddle.nn.Linear(5, 5)

        if not use_param_group:
            optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
                                              use_multi_tensor=use_multi_tensor,
                                              multi_precision=use_amp)
        else:
            optimizer = paddle.optimizer.Adam(parameters=[{
                'params':
                model.parameters(),
                'weight_decay':
                0.001,
                'beta1':
                0.1,
                'beta2':
                0.99
            }],
                                              use_multi_tensor=use_multi_tensor,
                                              multi_precision=use_amp)

        for idx in range(2):
            if place == 'gpu' and use_amp == True:
                model = paddle.amp.decorate(models=model, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

            if place == 'gpu' and use_amp == True:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.step(optimizer)
                optimizer.clear_grad()
            else:
                output = model(input)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()

        return output, model.parameters()

    def _adam_optimize_static(self,
                              place,
                              use_amp=False,
                              use_multi_tensor=False):
        paddle.enable_static()
        paddle.seed(10)
        np.random.seed(10)
        if place == 'cpu':
            use_amp = False
        exe = paddle.static.Executor(place=place)
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.optimizer.Adam(multi_precision=use_amp,
                                          use_multi_tensor=use_multi_tensor)
        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False)
        with paddle.static.program_guard(train_program, startup_program):
            if use_amp:
                data = paddle.static.data(shape=[2, 2],
                                          name='X',
                                          dtype='float16')
            else:
                data = paddle.static.data(shape=[2, 2],
                                          name='X',
                                          dtype='float32')
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        exe.run(startup_program)
        if use_amp:
            optimizer.amp_init(place=place, scope=paddle.static.global_scope())
            x = np.random.random(size=(2, 2)).astype('float16')
        else:
            x = np.random.random(size=(2, 2)).astype('float32')
        out = []
        for idx in range(5):
            loss_data, = exe.run(train_program,
                                 feed={"X": x},
                                 fetch_list=[loss.name])
            out.append(loss_data)
        return out

    def _get_places(self):
        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        return places

    def _check_with_place_amp(self, place, use_amp):
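        # The multi-tensor path should match the default path in both dygraph
        # and static graph modes.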
        # test dygraph mode
        output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph(
            place=place, use_amp=use_amp, use_multi_tensor=True)
        output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph(
            place=place, use_amp=use_amp, use_multi_tensor=False)
        np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05)
        for idx in range(len(params_dygraph1)):
            np.testing.assert_allclose(params_dygraph1[idx],
                                       params_dygraph2[idx],
                                       rtol=1e-05)
        # test static mode
        output_static1 = self._adam_optimize_static(place=place,
                                                    use_amp=use_amp,
                                                    use_multi_tensor=True)
        output_static2 = self._adam_optimize_static(place=place,
                                                    use_amp=use_amp,
                                                    use_multi_tensor=False)
        for idx in range(len(output_static1)):
            np.testing.assert_allclose(output_static1[idx],
                                       output_static2[idx],
                                       rtol=1e-05)

    def _check_with_param_attr(self, place, use_amp):
        output1, params1 = self._adam_optimize_dygraph(place=place,
                                                       use_amp=use_amp,
                                                       use_param_attr=True,
                                                       use_multi_tensor=True)
        output2, params2 = self._adam_optimize_dygraph(place=place,
                                                       use_amp=use_amp,
                                                       use_param_attr=True,
                                                       use_multi_tensor=False)

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        for idx in range(len(params1)):
            np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)

    def _check_with_param_group(self, place, use_amp):
        output1, params1 = self._adam_optimize_dygraph(place=place,
                                                       use_amp=use_amp,
                                                       use_param_group=True,
                                                       use_multi_tensor=True)
        output2, params2 = self._adam_optimize_dygraph(place=place,
                                                       use_amp=use_amp,
                                                       use_param_group=True,
                                                       use_multi_tensor=False)

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        for idx in range(len(params1)):
            np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)

    def test_main(self):
        for place in self._get_places():
            use_amp_list = [True, False]
            for use_amp in use_amp_list:
                self._check_with_place_amp(place, use_amp)
                self._check_with_param_attr(place, use_amp)
                self._check_with_param_group(place, use_amp)


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()