#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import numpy as np

import paddle
from paddle import fluid
from paddle.fluid import layers
from paddle.fluid.executor import Executor

os.environ["CPU_NUM"] = "1"


class RNNConfig:
    def __init__(self, model_type, rnn_model):
        self.model_type = model_type
        self.rnn_model = rnn_model

        self.vocab_size = 10000
        if self.model_type == "test":
            self.num_layers = 1
            self.batch_size = 2
            self.hidden_size = 10
            self.num_steps = 3
            self.init_scale = 0.1
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 1
            self.max_epoch = 1
            self.dropout = 0.0
            self.lr_decay = 0.5
            self.base_learning_rate = 1.0
        elif self.model_type == "small":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 200
            self.num_steps = 20
            self.init_scale = 0.1
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 4
            self.max_epoch = 13
            self.dropout = 0.0
            self.lr_decay = 0.5
            self.base_learning_rate = 1.0
        elif self.model_type == "medium":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 650
            self.num_steps = 35
            self.init_scale = 0.05
            self.max_grad_norm = 5.0
            self.epoch_start_decay = 6
            self.max_epoch = 39
            self.dropout = 0.5
            self.lr_decay = 0.8
            self.base_learning_rate = 1.0
        elif self.model_type == "large":
            self.num_layers = 2
            self.batch_size = 20
            self.hidden_size = 1500
            self.num_steps = 35
            self.init_scale = 0.04
            self.max_grad_norm = 10.0
            self.epoch_start_decay = 14
            self.max_epoch = 55
            self.dropout = 0.65
            self.lr_decay = 1.0 / 1.15
            self.base_learning_rate = 1.0
        else:
            raise ValueError('Unsupported model_type.')

        if rnn_model not in ('static', 'cudnn'):
            raise ValueError('Unsupported rnn_model.')

        # Regardless of model_type, shrink the run so the unit test stays
        # small, fast, and reproducible.
        self.batch_size = 12
        self.max_epoch = 3
        self.random_seed = 123


# Fake data reader for test
class Reader:
    def get_data_iter(self, rnn_config):
        for i in range(rnn_config.max_epoch):
            x = np.zeros(
                shape=(rnn_config.batch_size, rnn_config.num_steps),
                dtype='int64',
            )
            y = np.ones(
                shape=(rnn_config.batch_size, rnn_config.num_steps),
                dtype='int64',
            )
            yield (x, y)


# Model from PaddleNLP/models/language_model/lm_model.py in Paddle Models repo
def lm_model(
    hidden_size,
    vocab_size,
    batch_size,
    num_layers=2,
    num_steps=20,
    init_scale=0.1,
    dropout=None,
    rnn_model='static',
):
    def encoder_static(
        input_embedding, len=3, init_hidden=None, init_cell=None
    ):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
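            # Each layer owns one fused gate weight of shape
            # [2 * hidden_size, 4 * hidden_size] and a bias of shape
            # [4 * hidden_size], so all four LSTM gates share a single matmul.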
            weight_1 = paddle.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=paddle.nn.initializer.Uniform(
                    low=-init_scale, high=init_scale
                ),
            )
            weight_1_arr.append(weight_1)
            bias_1 = paddle.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=paddle.nn.initializer.Constant(0.0),
            )
            bias_arr.append(bias_1)

            pre_hidden = paddle.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_cell = paddle.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1]
            )
            pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = paddle.split(
            input_embedding, num_or_sections=len, axis=1
        )

        for index in range(len):
            input = sliced_inputs[index]
            input = paddle.reshape(input, shape=[-1, hidden_size])
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = paddle.concat([input, pre_hidden], 1)
                gate_input = paddle.matmul(x=nn, y=weight_1)

                gate_input = paddle.add(gate_input, bias)
                i, j, f, o = paddle.split(
                    gate_input, num_or_sections=4, axis=-1
                )

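                # Standard LSTM cell update: forget-gated previous cell plus
                # input-gated candidate; m is the output-gated tanh of the new cell.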
                c = pre_cell * paddle.nn.functional.sigmoid(
                    f
                ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = paddle.nn.functional.dropout(
                        input,
                        p=dropout,
                        mode='upscale_in_train',
                    )

            res.append(input)

        last_hidden = paddle.concat(hidden_array, 1)
        last_hidden = paddle.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size]
        )
        last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = paddle.concat(cell_array, 1)
        last_cell = paddle.reshape(
            last_cell, shape=[-1, num_layers, hidden_size]
        )
        last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = paddle.concat(res, 0)
        real_res = paddle.reshape(real_res, shape=[len, -1, hidden_size])
        real_res = paddle.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size
    x = paddle.static.data(
        name="x", shape=[batch_size_each, num_steps, 1], dtype='int64'
    )
    y = paddle.static.data(
        name="y", shape=[batch_size_each * num_steps, 1], dtype='int64'
    )

    init_hidden = paddle.static.data(
        name="init_hidden",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
    )
    init_cell = paddle.static.data(
        name="init_cell",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
    )

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden_reshape = paddle.reshape(
        init_hidden, shape=[num_layers, -1, hidden_size]
    )
    init_cell_reshape = paddle.reshape(
        init_cell, shape=[num_layers, -1, hidden_size]
    )

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=paddle.nn.initializer.Uniform(
                low=-init_scale, high=init_scale
            ),
        ),
    )

    x_emb = paddle.reshape(x_emb, shape=[-1, num_steps, hidden_size])
    if dropout is not None and dropout > 0.0:
        x_emb = paddle.nn.functional.dropout(
            x_emb,
            p=dropout,
            mode='upscale_in_train',
        )

    if rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape,
        )
    else:
        print("type not support")
        return

    rnn_out = paddle.reshape(rnn_out, shape=[-1, num_steps, hidden_size])

    softmax_weight = paddle.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=paddle.nn.initializer.Uniform(
            low=-init_scale, high=init_scale
        ),
    )
    softmax_bias = paddle.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=paddle.nn.initializer.Uniform(
            low=-init_scale, high=init_scale
        ),
    )

    projection = paddle.matmul(rnn_out, softmax_weight)
    projection = paddle.add(projection, softmax_bias)
    projection = paddle.reshape(projection, shape=[-1, vocab_size])

    loss = paddle.nn.functional.softmax_with_cross_entropy(
        logits=projection, label=y, soft_label=False
    )

    loss = paddle.reshape(loss, shape=[-1, num_steps])
    loss = paddle.mean(loss, axis=[0])
    loss = paddle.sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # Assign last_hidden and last_cell back to init_hidden and init_cell so
    # they can be used directly in the next batch. This avoids fetching
    # last_hidden and last_cell and feeding init_hidden and init_cell at every
    # training step.
    paddle.assign(last_cell, output=init_cell)
    paddle.assign(last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    return loss, last_hidden, last_cell, feeding_list


class PaddingRNNTestBase(unittest.TestCase):
    def setUp(self):
        self.reader = Reader()
        self.device_count = 1

        # The default exec_strategy used for PaddingRNN.
        # You can change it in set_customed_config.
        self.exec_strategy = fluid.ExecutionStrategy()
        self.exec_strategy.num_threads = self.device_count
        self.exec_strategy.num_iteration_per_drop_scope = 100

        # The default build_strategy used for PaddingRNN.
        # You can change it in set_customed_config.
        self.build_strategy = fluid.BuildStrategy()
        self.build_strategy.enable_inplace = True
        self.build_strategy.memory_optimize = False
        self.build_strategy.fuse_all_optimizer_ops = True

        # The CPU executor is used for PaddingRNN by default.
        # You can switch to a CUDA executor in set_customed_config.
        self.exe = Executor(fluid.CPUPlace())

    def set_customed_config(self):
        # This function will be called before training.
        # You can override the function to set your own config.
        pass

    def _prepare_program(self, config):
        paddle.seed(config.random_seed)
        self.main_program = fluid.Program()
        self.startup_program = fluid.Program()
        with fluid.program_guard(self.main_program, self.startup_program):
            with fluid.unique_name.guard():
                res_vars = lm_model(
                    config.hidden_size,
                    config.vocab_size,
                    config.batch_size,
                    num_layers=config.num_layers,
                    num_steps=config.num_steps,
                    init_scale=config.init_scale,
                    dropout=config.dropout,
                    rnn_model=config.rnn_model,
                )
                (
                    self.loss,
                    self.last_hidden,
                    self.last_cell,
                    self.feed_order,
                ) = res_vars

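                # Clip gradients by global norm before the SGD update.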
                paddle.nn.clip.set_gradient_clip(
                    clip=paddle.nn.ClipGradByGlobalNorm(
                        clip_norm=config.max_grad_norm
                    )
                )

                self.learning_rate = paddle.static.create_global_var(
                    name="learning_rate",
                    shape=[1],
                    value=1.0,
                    dtype='float32',
                    persistable=True,
                )

                optimizer = fluid.optimizer.SGD(
                    learning_rate=self.learning_rate
                )
                optimizer.minimize(self.loss)

        self.exe.run(self.startup_program)

        self.train_program = self.main_program

    def _generate_init_data(self):
        init_hidden = np.zeros(
            (
                self.config.num_layers,
                self.config.batch_size,
                self.config.hidden_size,
            ),
            dtype='float32',
        )
        init_cell = np.zeros(
            (
                self.config.num_layers,
                self.config.batch_size,
                self.config.hidden_size,
            ),
            dtype='float32',
        )
        return init_hidden, init_cell

    def _generate_new_lr(self, epoch_id=0, device_count=1):
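        # Exponential decay: lr = base_lr * lr_decay ** max(epoch_id + 1 - epoch_start_decay, 0).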
        new_lr = self.config.base_learning_rate * (
            self.config.lr_decay
            ** max(epoch_id + 1 - self.config.epoch_start_decay, 0.0)
        )
        lr = np.ones((self.device_count), dtype='float32') * new_lr
        return lr

    def _prepare_input(
        self,
        batch,
        init_hidden=None,
        init_cell=None,
        epoch_id=0,
        with_lr=True,
        device_count=1,
    ):
        x, y = batch
        x = x.reshape((-1, self.config.num_steps, 1))
        y = y.reshape((-1, 1))

        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = self._generate_new_lr(epoch_id, device_count)
        return res

    def _train_an_epoch(self, epoch_id, use_program_cache=True):
        train_data_iter = self.reader.get_data_iter(self.config)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = self._generate_init_data()
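        # Running perplexity for this epoch: exp(accumulated loss / time steps seen).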
        ppl = np.zeros(shape=(0))
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = self._prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=self.device_count,
            )

            fetch_outs = self.exe.run(
                self.train_program,
                feed=input_data_feed,
                fetch_list=[
                    self.loss.name,
                    "learning_rate",
                    self.last_hidden.name,
                    self.last_cell.name,
                ],
                use_program_cache=use_program_cache,
            )

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
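            # Feed the fetched final hidden/cell states back in as the next
            # batch's initial states.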
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += self.config.num_steps

            batch_ppl = np.exp(total_loss / iters)
            ppl = np.append(ppl, batch_ppl)
        return ppl

    def train(self, config, use_program_cache=True):
        self.set_customed_config()

        self.config = config
        self._prepare_program(config)
        ppl = np.zeros(shape=(0, config.batch_size))
        for epoch_id in range(config.max_epoch):
            train_ppl = self._train_an_epoch(epoch_id, use_program_cache)
            ppl = np.append(ppl, train_ppl)
        return ppl


if __name__ == '__main__':
    unittest.main()