#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.contrib.layers import basic_lstm
from paddle.fluid.executor import Executor
from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN

os.environ["CPU_NUM"] = "1"


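# Hyper-parameter presets ('test', 'small', 'medium', 'large') for the language
# model below. The unittests only use the 'test' preset; batch_size, max_epoch
# and random_seed are then overridden at the end of __init__ so that runs are
# short and deterministic.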
class RNNConfig:
    def __init__(self, model_type, rnn_model):
        self.model_type = model_type
        self.rnn_model = rnn_model

        self.vocab_size = 10000
        if self.model_type == "test":
            self.num_layers = 1
            self.batch_size = 2
            self.hidden_size = 10
            self.num_steps = 3
            self.init_scale = 0.1
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 1
            self.max_epoch = 1
            self.dropout = 0.0
            self.lr_decay = 0.5
            self.base_learning_rate = 1.0
        elif self.model_type == "small":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 200
            self.num_steps = 20
            self.init_scale = 0.1
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 4
            self.max_epoch = 13
            self.dropout = 0.0
            self.lr_decay = 0.5
            self.base_learning_rate = 1.0
        elif self.model_type == "medium":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 650
            self.num_steps = 35
            self.init_scale = 0.05
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 6
            self.max_epoch = 39
            self.dropout = 0.5
            self.lr_decay = 0.8
            self.base_learning_rate = 1.0
        elif self.model_type == "large":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 1500
            self.num_steps = 35
            self.init_scale = 0.04
            self.max_grad_norm = 10.0
            self.epoch_start_decay = 14
            self.max_epoch = 55
            self.dropout = 0.65
            self.lr_decay = 1.0 / 1.15
            self.base_learning_rate = 1.0
        else:
            raise ValueError('Unsupported model_type.')

        if rnn_model not in ('static', 'padding', 'cudnn', 'basic_lstm'):
            raise ValueError('Unsupported rnn_model.')

        self.batch_size = 12
        self.max_epoch = 3
        self.random_seed = 123


# Fake data reader for test
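# It yields one all-zeros/all-ones (x, y) batch per epoch; the test only checks
# that different RNN implementations produce the same loss, so real text data
# is not needed.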
class Reader:
    def get_data_iter(self, rnn_config):
        for i in range(rnn_config.max_epoch):
            x = np.zeros(
                shape=(rnn_config.batch_size, rnn_config.num_steps),
                dtype='int64',
            )
            y = np.ones(
                shape=(rnn_config.batch_size, rnn_config.num_steps),
                dtype='int64',
            )
            yield (x, y)


# Model from PaddleNLP/models/language_model/lm_model.py in Paddle Models repo
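# lm_model builds the static graph of a word-level LSTM language model. The
# recurrent part can be built in several interchangeable ways, selected by
# rnn_model: 'padding' (StaticRNN), 'static' (loop unrolled in Python), or
# 'basic_lstm' (the fluid.contrib basic_lstm layer).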
def lm_model(
    hidden_size,
    vocab_size,
    batch_size,
    num_layers=2,
    num_steps=20,
    init_scale=0.1,
    dropout=None,
    rnn_model='static',
):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
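            # Every layer owns one fused gate weight of shape
            # [2 * hidden_size, 4 * hidden_size] and a bias of shape
            # [4 * hidden_size]; its initial hidden/cell state is sliced out
            # of init_hidden / init_cell along axis 0.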
            weight_1 = paddle.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale
                ),
            )
            weight_1_arr.append(weight_1)
            bias_1 = paddle.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0),
            )
            bias_arr.append(bias_1)

            pre_hidden = paddle.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_cell = paddle.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = paddle.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = paddle.matmul(x=nn, y=weight_1)

                gate_input = paddle.add(gate_input, bias)
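                # gate_input packs the four LSTM gates; slice it into the
                # input gate (i), cell candidate (j), forget gate (f) and
                # output gate (o), then apply the standard LSTM update:
                #   c = sigmoid(f) * c_prev + sigmoid(i) * tanh(j)
                #   m = tanh(c) * sigmoid(o)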
                i = paddle.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size]
                )
                j = paddle.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2],
                )
                f = paddle.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3],
                )
                o = paddle.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4],
                )

                c = pre_cell * paddle.nn.functional.sigmoid(
                    f
                ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = paddle.nn.functional.dropout(
                        input,
                        p=dropout,
                        mode='upscale_in_train',
                    )

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = paddle.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps]
            )
            last_hidden_array.append(last_h)
            last_c = paddle.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps]
            )
            last_cell_array.append(last_c)
        real_res = paddle.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

    def encoder_static(
        input_embedding, len=3, init_hidden=None, init_cell=None
    ):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = paddle.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale
                ),
            )
            weight_1_arr.append(weight_1)
            bias_1 = paddle.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0),
            )
            bias_arr.append(bias_1)

            pre_hidden = paddle.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_cell = paddle.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1
        )

        for index in range(len):
            input = sliced_inputs[index]
            input = paddle.reshape(input, shape=[-1, hidden_size])
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = paddle.matmul(x=nn, y=weight_1)

                gate_input = paddle.add(gate_input, bias)
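                # Same fused-gate LSTM cell as in padding_rnn, but here the
                # time loop is unrolled in Python and the gates are obtained
                # with a single split instead of four slices.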
                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

                c = pre_cell * paddle.nn.functional.sigmoid(
                    f
                ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = paddle.nn.functional.dropout(
                        input,
                        p=dropout,
                        mode='upscale_in_train',
                    )

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = paddle.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size]
        )
        last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = paddle.reshape(
            last_cell, shape=[-1, num_layers, hidden_size]
        )
        last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = paddle.reshape(real_res, shape=[len, -1, hidden_size])
        real_res = paddle.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size
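    # Graph inputs: x holds int64 token ids of shape [batch, num_steps, 1],
    # y holds the flattened next-token labels, and init_hidden / init_cell
    # carry the per-layer LSTM state of shape [num_layers, batch, hidden_size].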
    x = layers.data(
        name="x",
        shape=[batch_size_each, num_steps, 1],
        dtype='int64',
        append_batch_size=False,
    )
    y = layers.data(
        name="y",
        shape=[batch_size_each * num_steps, 1],
        dtype='int64',
        append_batch_size=False,
    )

    init_hidden = layers.data(
        name="init_hidden",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False,
    )
    init_cell = layers.data(
        name="init_cell",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False,
    )

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden_reshape = paddle.reshape(
        init_hidden, shape=[num_layers, -1, hidden_size]
    )
    init_cell_reshape = paddle.reshape(
        init_cell, shape=[num_layers, -1, hidden_size]
    )

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale
            ),
        ),
    )

    x_emb = paddle.reshape(x_emb, shape=[-1, num_steps, hidden_size])
    if dropout is not None and dropout > 0.0:
        x_emb = paddle.nn.functional.dropout(
            x_emb,
            p=dropout,
            mode='upscale_in_train',
        )

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape,
        )
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape,
        )
    elif rnn_model == "basic_lstm":
        rnn_out, last_hidden, last_cell = basic_lstm(
            x_emb,
            init_hidden,
            init_cell,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout_prob=dropout,
            param_attr=ParamAttr(
                initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale
                )
            ),
            bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0)),
            forget_bias=0.0,
        )
    else:
        print("type not supported")
        return

    rnn_out = paddle.reshape(rnn_out, shape=[-1, num_steps, hidden_size])
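
    # Project the RNN output onto the vocabulary and compute the per-step
    # softmax cross entropy; averaging over the batch and summing over time
    # gives the scalar training loss.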
    softmax_weight = paddle.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale
        ),
    )
    softmax_bias = paddle.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale
        ),
    )

    projection = paddle.matmul(rnn_out, softmax_weight)
    projection = paddle.add(projection, softmax_bias)
    projection = paddle.reshape(projection, shape=[-1, vocab_size])

    loss = paddle.nn.functional.softmax_with_cross_entropy(
        logits=projection, label=y, soft_label=False
    )

    loss = paddle.reshape(loss, shape=[-1, num_steps])
    loss = paddle.mean(loss, axis=[0])
    loss = paddle.sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # Feed last_hidden / last_cell back into init_hidden / init_cell so the
    # final state of one batch becomes the initial state of the next one.
    # This avoids fetching last_hidden / last_cell and feeding init_hidden /
    # init_cell explicitly at every training step.
    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    return loss, last_hidden, last_cell, feeding_list


class PaddingRNNTestBase(unittest.TestCase):
    def setUp(self):
        self.reader = Reader()
        self.device_count = 1

        # The default exec_strategy used for PaddingRNN.
        # You can change it in set_customed_config.
        self.exec_strategy = fluid.ExecutionStrategy()
        self.exec_strategy.num_threads = self.device_count
        self.exec_strategy.num_iteration_per_drop_scope = 100

        # The default build_strategy used for PaddingRNN.
        # You can change it in set_customed_config.
        self.build_strategy = fluid.BuildStrategy()
        self.build_strategy.enable_inplace = True
        self.build_strategy.memory_optimize = False
        self.build_strategy.fuse_all_optimizer_ops = True

        # The CPU executor is used for PaddingRNN by default.
        # You can change to CUDA executor in set_customed_config.
        self.exe = Executor(fluid.CPUPlace())

    def set_customed_config(self):
        # This function will be called before training.
        # You can override the function to set your own config.
        pass

    def _prepare_program(self, config, parallel=True):
        paddle.seed(config.random_seed)
        self.main_program = fluid.Program()
        self.startup_program = fluid.Program()
        with fluid.program_guard(self.main_program, self.startup_program):
            with fluid.unique_name.guard():
                res_vars = lm_model(
                    config.hidden_size,
                    config.vocab_size,
                    config.batch_size,
                    num_layers=config.num_layers,
                    num_steps=config.num_steps,
                    init_scale=config.init_scale,
                    dropout=config.dropout,
                    rnn_model=config.rnn_model,
                )
                (
                    self.loss,
                    self.last_hidden,
                    self.last_cell,
                    self.feed_order,
                ) = res_vars

                fluid.clip.set_gradient_clip(
                    clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=config.max_grad_norm
                    )
                )

                self.learning_rate = paddle.static.create_global_var(
                    name="learning_rate",
                    shape=[1],
                    value=1.0,
                    dtype='float32',
                    persistable=True,
                )

                optimizer = fluid.optimizer.SGD(
                    learning_rate=self.learning_rate
                )
                optimizer.minimize(self.loss)

        self.exe.run(self.startup_program)

        if parallel:
            self.train_program = fluid.compiler.CompiledProgram(
                self.main_program
            ).with_data_parallel(
                loss_name=self.loss.name,
                build_strategy=self.build_strategy,
                exec_strategy=self.exec_strategy,
            )
        else:
            self.train_program = self.main_program

    def _generate_init_data(self):
        init_hidden = np.zeros(
            (
                self.config.num_layers,
                self.config.batch_size,
                self.config.hidden_size,
            ),
            dtype='float32',
        )
        init_cell = np.zeros(
            (
                self.config.num_layers,
                self.config.batch_size,
                self.config.hidden_size,
            ),
            dtype='float32',
        )
        return init_hidden, init_cell

    def _generate_new_lr(self, epoch_id=0, device_count=1):
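        # Learning rate schedule:
        #   new_lr = base_learning_rate
        #            * lr_decay ** max(epoch_id + 1 - epoch_start_decay, 0)
        # i.e. the rate stays constant until epoch_start_decay and then
        # decays geometrically.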
        new_lr = self.config.base_learning_rate * (
            self.config.lr_decay
            ** max(epoch_id + 1 - self.config.epoch_start_decay, 0.0)
        )
        lr = np.ones((self.device_count), dtype='float32') * new_lr
        return lr

    def _prepare_input(
        self,
        batch,
        init_hidden=None,
        init_cell=None,
        epoch_id=0,
        with_lr=True,
        device_count=1,
    ):
        x, y = batch
        x = x.reshape((-1, self.config.num_steps, 1))
        y = y.reshape((-1, 1))

        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = self._generate_new_lr(epoch_id, device_count)
        return res

    def _train_an_epoch(self, epoch_id, use_program_cache=True):
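        # Runs one pass over the fake reader: each step feeds the previously
        # fetched last_hidden / last_cell back in as init_hidden / init_cell
        # and records the running perplexity exp(total_loss / iters).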
        train_data_iter = self.reader.get_data_iter(self.config)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = self._generate_init_data()
        ppl = np.zeros(shape=(0))
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = self._prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=self.device_count,
            )

            fetch_outs = self.exe.run(
                self.train_program,
                feed=input_data_feed,
                fetch_list=[
                    self.loss.name,
                    "learning_rate",
                    self.last_hidden.name,
                    self.last_cell.name,
                ],
                use_program_cache=use_program_cache,
            )

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += self.config.num_steps

            batch_ppl = np.exp(total_loss / iters)
            ppl = np.append(ppl, batch_ppl)
        return ppl

    def train(self, config, parallel=True, use_program_cache=True):
        self.set_customed_config()

        self.config = config
        self._prepare_program(config, parallel)
        ppl = np.zeros(shape=(0, config.batch_size))
        for epoch_id in range(config.max_epoch):
            train_ppl = self._train_an_epoch(epoch_id, use_program_cache)
            ppl = np.append(ppl, train_ppl)
        return ppl

    def compare_padding_static_mode(
        self, parallel=True, use_program_cache=True
    ):
        '''
        Test that the training ppl of padding mode is the same as that of
        static mode.
        '''
        config = RNNConfig('test', 'padding')
        with fluid.scope_guard(fluid.Scope()):
            padding_rnn_ppl = self.train(config, parallel, use_program_cache)
        config = RNNConfig('test', 'static')
        with fluid.scope_guard(fluid.Scope()):
            static_rnn_ppl = self.train(config, parallel, use_program_cache)
        np.testing.assert_allclose(padding_rnn_ppl, static_rnn_ppl, rtol=0.001)


class EagerDeletionPaddingRNNTest(PaddingRNNTestBase):
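    # Both tests run the same padding-vs-static comparison and differ only in
    # the eager deletion setting: as the test names indicate, the negative
    # first argument to _set_eager_deletion_mode disables eager deletion,
    # while 0.0 enables it.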
    def test_padding_mode_no_eager_deletion(self):
        '''
        Test that the training ppl of padding mode is the same as that of
        static mode without eager deletion.
        '''
        fluid.core._set_eager_deletion_mode(-1.0, 1.0, True)
        # When parallel is True, use_program_cache does not make a difference.
        self.compare_padding_static_mode(parallel=True, use_program_cache=True)

    def test_padding_mode_eager_deletion(self):
        '''
        Test that the training ppl of padding mode is the same as that of
        static mode under eager deletion.
        '''
        fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
        # When parallel is True, use_program_cache does not make a difference.
        self.compare_padding_static_mode(parallel=True, use_program_cache=True)


if __name__ == '__main__':
    unittest.main()