#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import numpy as np

import paddle
from paddle import fluid
from paddle.fluid import layers
from paddle.fluid.executor import Executor
from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN

os.environ["CPU_NUM"] = "1"


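# Hyper-parameter presets for the word-level language model exercised by this test.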
class RNNConfig:
    def __init__(self, model_type, rnn_model):
        self.model_type = model_type
        self.rnn_model = rnn_model

        self.vocab_size = 10000
        if self.model_type == "test":
            self.num_layers = 1
            self.batch_size = 2
            self.hidden_size = 10
            self.num_steps = 3
            self.init_scale = 0.1
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 1
            self.max_epoch = 1
            self.dropout = 0.0
            self.lr_decay = 0.5
            self.base_learning_rate = 1.0
        elif self.model_type == "small":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 200
            self.num_steps = 20
            self.init_scale = 0.1
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 4
            self.max_epoch = 13
            self.dropout = 0.0
            self.lr_decay = 0.5
            self.base_learning_rate = 1.0
        elif self.model_type == "medium":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 650
            self.num_steps = 35
            self.init_scale = 0.05
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 6
            self.max_epoch = 39
            self.dropout = 0.5
            self.lr_decay = 0.8
            self.base_learning_rate = 1.0
        elif self.model_type == "large":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 1500
            self.num_steps = 35
            self.init_scale = 0.04
            self.max_grad_norm = 10.0
            self.epoch_start_decay = 14
            self.max_epoch = 55
            self.dropout = 0.65
            self.lr_decay = 1.0 / 1.15
            self.base_learning_rate = 1.0
        else:
            raise ValueError('Unsupported model_type.')

        if rnn_model not in ('static', 'padding', 'cudnn'):
            raise ValueError('Unsupported rnn_model.')

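        # Overridden for the unit test regardless of model_type: a small batch,
        # few epochs and a fixed seed keep the runs short and deterministic.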
        self.batch_size = 12
        self.max_epoch = 3
        self.random_seed = 123


# Fake data reader for test
class Reader:
    def get_data_iter(self, rnn_config):
        for i in range(rnn_config.max_epoch):
            x = np.zeros(
                shape=(rnn_config.batch_size, rnn_config.num_steps),
                dtype='int64',
            )
            y = np.ones(
                shape=(rnn_config.batch_size, rnn_config.num_steps),
                dtype='int64',
            )
            yield (x, y)


# Model from PaddleNLP/models/language_model/lm_model.py in Paddle Models repo
def lm_model(
    hidden_size,
    vocab_size,
    batch_size,
    num_layers=2,
    num_steps=20,
    init_scale=0.1,
    dropout=None,
    rnn_model='static',
):
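    # The 'padding' implementation: a multi-layer LSTM built with fluid's
    # StaticRNN, which steps over the time dimension inside the static graph.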
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = paddle.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=paddle.nn.initializer.Uniform(
                    low=-init_scale, high=init_scale
                ),
            )
            weight_1_arr.append(weight_1)
            bias_1 = paddle.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=paddle.nn.initializer.Constant(0.0),
            )
            bias_arr.append(bias_1)

            pre_hidden = paddle.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_cell = paddle.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

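        # [batch, num_steps, hidden] -> [num_steps, batch, hidden] so that
        # StaticRNN consumes one time step per iteration.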
        input_embedding = paddle.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = paddle.concat([input, pre_hidden], 1)
                gate_input = paddle.matmul(x=nn, y=weight_1)

                gate_input = paddle.add(gate_input, bias)
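                # Slice the fused projection into the four LSTM gates:
                # input (i), cell candidate (j), forget (f) and output (o).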
                i = paddle.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size]
                )
                j = paddle.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2],
                )
                f = paddle.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3],
                )
                o = paddle.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4],
                )

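                # Standard LSTM update: new cell state c and new hidden state m.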
                c = pre_cell * paddle.nn.functional.sigmoid(
                    f
                ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = paddle.nn.functional.dropout(
                        input,
                        p=dropout,
                        mode='upscale_in_train',
                    )

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = paddle.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps]
            )
            last_hidden_array.append(last_h)
            last_c = paddle.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps]
            )
            last_cell_array.append(last_c)
        real_res = paddle.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = paddle.concat(last_hidden_array, 0)
        last_cell = paddle.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

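    # The 'static' implementation: the same LSTM unrolled step by step in
    # Python; the test uses it as the reference for the padding RNN.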
    def encoder_static(
        input_embedding, len=3, init_hidden=None, init_cell=None
    ):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = paddle.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=paddle.nn.initializer.Uniform(
                    low=-init_scale, high=init_scale
                ),
            )
            weight_1_arr.append(weight_1)
            bias_1 = paddle.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=paddle.nn.initializer.Constant(0.0),
            )
            bias_arr.append(bias_1)

            pre_hidden = paddle.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_cell = paddle.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = paddle.split(
            input_embedding, num_or_sections=len, axis=1
        )

        for index in range(len):
            input = sliced_inputs[index]
            input = paddle.reshape(input, shape=[-1, hidden_size])
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = paddle.concat([input, pre_hidden], 1)
                gate_input = paddle.matmul(x=nn, y=weight_1)

                gate_input = paddle.add(gate_input, bias)
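                # Split the fused projection into the four LSTM gates in one call.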
                i, j, f, o = paddle.split(
                    gate_input, num_or_sections=4, axis=-1
                )

                c = pre_cell * paddle.nn.functional.sigmoid(
                    f
                ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = paddle.nn.functional.dropout(
                        input,
                        p=dropout,
                        mode='upscale_in_train',
                    )

            res.append(input)

        last_hidden = paddle.concat(hidden_array, 1)
        last_hidden = paddle.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size]
        )
        last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = paddle.concat(cell_array, 1)
        last_cell = paddle.reshape(
            last_cell, shape=[-1, num_layers, hidden_size]
        )
        last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = paddle.concat(res, 0)
        real_res = paddle.reshape(real_res, shape=[len, -1, hidden_size])
        real_res = paddle.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size
    x = paddle.static.data(
        name="x", shape=[batch_size_each, num_steps, 1], dtype='int64'
    )
    y = paddle.static.data(
        name="y", shape=[batch_size_each * num_steps, 1], dtype='int64'
    )

    init_hidden = paddle.static.data(
        name="init_hidden",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
    )
    init_cell = paddle.static.data(
        name="init_cell",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
    )

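    # Keep init_hidden/init_cell persistent so that the paddle.assign calls at
    # the end of the model can carry the last states over to the next batch.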
    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden_reshape = paddle.reshape(
        init_hidden, shape=[num_layers, -1, hidden_size]
    )
    init_cell_reshape = paddle.reshape(
        init_cell, shape=[num_layers, -1, hidden_size]
    )

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=paddle.nn.initializer.Uniform(
                low=-init_scale, high=init_scale
            ),
        ),
    )

    x_emb = paddle.reshape(x_emb, shape=[-1, num_steps, hidden_size])
    if dropout is not None and dropout > 0.0:
        x_emb = paddle.nn.functional.dropout(
            x_emb,
            p=dropout,
            mode='upscale_in_train',
        )

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape,
        )
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape,
        )
    else:
        print("type not support")
        return

    rnn_out = paddle.reshape(rnn_out, shape=[-1, num_steps, hidden_size])

    softmax_weight = paddle.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=paddle.nn.initializer.Uniform(
            low=-init_scale, high=init_scale
        ),
    )
    softmax_bias = paddle.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=paddle.nn.initializer.Uniform(
            low=-init_scale, high=init_scale
        ),
    )

    projection = paddle.matmul(rnn_out, softmax_weight)
    projection = paddle.add(projection, softmax_bias)
    projection = paddle.reshape(projection, shape=[-1, vocab_size])

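    # Per-token cross entropy; average over the batch, then sum over time steps.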
    loss = paddle.nn.functional.softmax_with_cross_entropy(
        logits=projection, label=y, soft_label=False
    )

    loss = paddle.reshape(loss, shape=[-1, num_steps])
    loss = paddle.mean(loss, axis=[0])
    loss = paddle.sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # This will feed last_hidden, last_cell to init_hidden, init_cell, which
    # can be used directly in next batch. This can avoid the fetching of
    # last_hidden and last_cell and feeding of init_hidden and init_cell in
    # each training step.
    paddle.assign(last_cell, output=init_cell)
    paddle.assign(last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    return loss, last_hidden, last_cell, feeding_list


class PaddingRNNTestBase(unittest.TestCase):
    def setUp(self):
        self.reader = Reader()
        self.device_count = 1

        # The default exec_strategy used for PaddingRNN.
        # You can change it in set_customed_config.
        self.exec_strategy = fluid.ExecutionStrategy()
        self.exec_strategy.num_threads = self.device_count
        self.exec_strategy.num_iteration_per_drop_scope = 100

        # The default build_strategy used for PaddingRNN.
        # You can change it in set_customed_config.
        self.build_strategy = fluid.BuildStrategy()
        self.build_strategy.enable_inplace = True
        self.build_strategy.memory_optimize = False
        self.build_strategy.fuse_all_optimizer_ops = True

        # A CPU executor is used for PaddingRNN by default.
        # You can change to a CUDA executor in set_customed_config.
        self.exe = Executor(fluid.CPUPlace())

    def set_customed_config(self):
        # This function will be called before training.
        # You can override the function to set your own config.
        pass

    def _prepare_program(self, config):
        paddle.seed(config.random_seed)
        self.main_program = fluid.Program()
        self.startup_program = fluid.Program()
        with fluid.program_guard(self.main_program, self.startup_program):
            with fluid.unique_name.guard():
                res_vars = lm_model(
                    config.hidden_size,
                    config.vocab_size,
                    config.batch_size,
                    num_layers=config.num_layers,
                    num_steps=config.num_steps,
                    init_scale=config.init_scale,
                    dropout=config.dropout,
                    rnn_model=config.rnn_model,
                )
                (
                    self.loss,
                    self.last_hidden,
                    self.last_cell,
                    self.feed_order,
                ) = res_vars

                paddle.nn.clip.set_gradient_clip(
                    clip=paddle.nn.ClipGradByGlobalNorm(
                        clip_norm=config.max_grad_norm
                    )
                )

                self.learning_rate = paddle.static.create_global_var(
                    name="learning_rate",
                    shape=[1],
                    value=1.0,
                    dtype='float32',
                    persistable=True,
                )

                optimizer = fluid.optimizer.SGD(
                    learning_rate=self.learning_rate
                )
                optimizer.minimize(self.loss)

        self.exe.run(self.startup_program)

        self.train_program = self.main_program

    def _generate_init_data(self):
        init_hidden = np.zeros(
            (
                self.config.num_layers,
                self.config.batch_size,
                self.config.hidden_size,
            ),
            dtype='float32',
        )
        init_cell = np.zeros(
            (
                self.config.num_layers,
                self.config.batch_size,
                self.config.hidden_size,
            ),
            dtype='float32',
        )
        return init_hidden, init_cell

    def _generate_new_lr(self, epoch_id=0, device_count=1):
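        # Exponential decay schedule:
        # lr = base_lr * lr_decay ** max(epoch_id + 1 - epoch_start_decay, 0).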
        new_lr = self.config.base_learning_rate * (
            self.config.lr_decay
            ** max(epoch_id + 1 - self.config.epoch_start_decay, 0.0)
        )
        lr = np.ones((self.device_count), dtype='float32') * new_lr
        return lr

    def _prepare_input(
        self,
        batch,
        init_hidden=None,
        init_cell=None,
        epoch_id=0,
        with_lr=True,
        device_count=1,
    ):
        x, y = batch
        x = x.reshape((-1, self.config.num_steps, 1))
        y = y.reshape((-1, 1))

        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = self._generate_new_lr(epoch_id, device_count)
        return res

    def _train_an_epoch(self, epoch_id, use_program_cache=True):
        train_data_iter = self.reader.get_data_iter(self.config)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = self._generate_init_data()
        ppl = np.zeros(shape=(0))
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = self._prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=self.device_count,
            )

            fetch_outs = self.exe.run(
                self.train_program,
                feed=input_data_feed,
                fetch_list=[
                    self.loss.name,
                    "learning_rate",
                    self.last_hidden.name,
                    self.last_cell.name,
                ],
                use_program_cache=use_program_cache,
            )

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += self.config.num_steps

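            # Running perplexity: exp of the accumulated loss per time step so far.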
            batch_ppl = np.exp(total_loss / iters)
            ppl = np.append(ppl, batch_ppl)
        return ppl

    def train(self, config, use_program_cache=True):
        self.set_customed_config()

        self.config = config
        self._prepare_program(config)
        ppl = np.zeros(shape=(0, config.batch_size))
        for epoch_id in range(config.max_epoch):
            train_ppl = self._train_an_epoch(epoch_id, use_program_cache)
            ppl = np.append(ppl, train_ppl)
        return ppl

    def compare_padding_static_mode(self, use_program_cache=True):
        '''
        Test that the training ppl of padding mode matches that of static graph mode.
        '''
        config = RNNConfig('test', 'padding')
        with fluid.scope_guard(fluid.Scope()):
            padding_rnn_ppl = self.train(config, use_program_cache)
        config = RNNConfig('test', 'static')
        with fluid.scope_guard(fluid.Scope()):
            static_rnn_ppl = self.train(config, use_program_cache)
        np.testing.assert_allclose(padding_rnn_ppl, static_rnn_ppl, rtol=0.001)


class EagerDeletionPaddingRNNTest(PaddingRNNTestBase):
    def test_padding_mode_no_eager_deletion(self):
        '''
        Test that the training ppl of padding mode matches that of static graph mode when eager deletion is disabled.
        '''
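        # The first argument is assumed to be the eager-deletion threshold;
        # a negative value turns the GC pass off.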
        fluid.core._set_eager_deletion_mode(-1.0, 1.0, True)
        # When parallel is True, use_program_cache does not make a difference.
        self.compare_padding_static_mode(use_program_cache=True)

    def test_padding_mode_eager_deletion(self):
        '''
        Test that the training ppl of padding mode matches that of static graph mode when eager deletion is enabled.
        '''
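        # A zero threshold is assumed to enable eager deletion for every
        # variable as soon as it is no longer needed.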
        fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
        # When parallel is True, use_program_cache does not make a difference.
        self.compare_padding_static_mode(use_program_cache=True)


if __name__ == '__main__':
    unittest.main()