# -*- coding: utf-8 -*-
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.nn import Embedding
from seq2seq_utils import Seq2SeqModelHyperParams as args

INF = 1.0 * 1e5
alpha = 0.6
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(
    low=-x, high=x
)
zero_constant = fluid.initializer.Constant(0.0)


class BasicLSTMUnit(Layer):
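    """A basic LSTM cell implemented as a dygraph Layer.

    The step input and the previous hidden state are concatenated and
    multiplied by a single fused weight matrix of shape
    [input_size + hidden_size, 4 * hidden_size]; the result is split into the
    input (i), candidate (j), forget (f) and output (o) gates, with
    `forget_bias` added to the forget gate before the sigmoid.
    """
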
    def __init__(
        self,
        hidden_size,
        input_size,
        param_attr=None,
        bias_attr=None,
        gate_activation=None,
        activation=None,
        forget_bias=1.0,
        dtype='float32',
    ):
        super().__init__(dtype)

        self._hidden_size = hidden_size
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
        self._activation = activation or paddle.tanh
        self._forget_bias = forget_bias
        self._dtype = dtype
        self._input_size = input_size

        self._weight = self.create_parameter(
            attr=self._param_attr,
            shape=[self._input_size + self._hidden_size, 4 * self._hidden_size],
            dtype=self._dtype,
        )

        self._bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[4 * self._hidden_size],
            dtype=self._dtype,
            is_bias=True,
        )

    def forward(self, input, pre_hidden, pre_cell):
        concat_input_hidden = layers.concat([input, pre_hidden], 1)
        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)

        gate_input = layers.elementwise_add(gate_input, self._bias)
        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
        new_cell = layers.elementwise_add(
            layers.elementwise_mul(
                pre_cell, paddle.nn.functional.sigmoid(f + self._forget_bias)
            ),
            layers.elementwise_mul(
                paddle.nn.functional.sigmoid(i), paddle.tanh(j)
            ),
        )

        new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)

        return new_hidden, new_cell


class BaseModel(fluid.dygraph.Layer):
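    """An encoder-decoder (seq2seq) model built from stacked BasicLSTMUnit cells.

    `forward` computes the teacher-forced training loss; `beam_search` decodes
    with beam search. Both are decorated with @declarative so that the dygraph
    code can be transformed to a static graph.
    """
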
    def __init__(
        self,
        hidden_size,
        src_vocab_size,
        tar_vocab_size,
        batch_size,
        num_layers=1,
        init_scale=0.1,
        dropout=None,
        beam_size=1,
        beam_start_token=1,
        beam_end_token=2,
        beam_max_step_num=2,
        mode='train',
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.src_vocab_size = src_vocab_size
        self.tar_vocab_size = tar_vocab_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.init_scale = init_scale
        self.dropout = dropout
        self.beam_size = beam_size
        self.beam_start_token = beam_start_token
        self.beam_end_token = beam_end_token
        self.beam_max_step_num = beam_max_step_num
        self.mode = mode
        self.kinf = 1e9

        param_attr = ParamAttr(initializer=uniform_initializer(self.init_scale))
        bias_attr = ParamAttr(initializer=zero_constant)
        forget_bias = 1.0

        self.src_embeder = Embedding(
            size=[self.src_vocab_size, self.hidden_size],
            param_attr=fluid.ParamAttr(
                initializer=uniform_initializer(init_scale)
            ),
        )

        self.tar_embeder = Embedding(
            size=[self.tar_vocab_size, self.hidden_size],
            is_sparse=False,
            param_attr=fluid.ParamAttr(
                initializer=uniform_initializer(init_scale)
            ),
        )

        self.enc_units = []
        for i in range(num_layers):
            self.enc_units.append(
                self.add_sublayer(
                    "enc_units_%d" % i,
                    BasicLSTMUnit(
                        hidden_size=self.hidden_size,
                        input_size=self.hidden_size,
                        param_attr=param_attr,
                        bias_attr=bias_attr,
                        forget_bias=forget_bias,
                    ),
                )
            )

        self.dec_units = []
        for i in range(num_layers):
            self.dec_units.append(
                self.add_sublayer(
                    "dec_units_%d" % i,
                    BasicLSTMUnit(
                        hidden_size=self.hidden_size,
                        input_size=self.hidden_size,
                        param_attr=param_attr,
                        bias_attr=bias_attr,
                        forget_bias=forget_bias,
                    ),
                )
            )

        self.fc = fluid.dygraph.nn.Linear(
            self.hidden_size,
            self.tar_vocab_size,
            param_attr=param_attr,
            bias_attr=False,
        )

    def _transpose_batch_time(self, x):
        return paddle.transpose(x, [1, 0] + list(range(2, len(x.shape))))

    def _merge_batch_beams(self, x):
        return paddle.reshape(x, shape=(-1, x.shape[2]))

    def _split_batch_beams(self, x):
        return paddle.reshape(x, shape=(-1, self.beam_size, x.shape[1]))

    def _expand_to_beam_size(self, x):
        x = fluid.layers.unsqueeze(x, [1])
        expand_times = [1] * len(x.shape)
        expand_times[1] = self.beam_size
        x = fluid.layers.expand(x, expand_times)
        return x

    def _real_state(self, state, new_state, step_mask):
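        # Keep the newly computed state where step_mask == 1 and fall back to
        # the previous state where step_mask == 0 (i.e. for padded time steps).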
        new_state = fluid.layers.elementwise_mul(
            new_state, step_mask, axis=0
        ) - fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0)
        return new_state

    def _gather(self, x, indices, batch_pos):
        topk_coordinates = paddle.stack([batch_pos, indices], axis=2)
        return fluid.layers.gather_nd(x, topk_coordinates)

    @declarative
    def forward(self, inputs):
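        """Compute the training loss.

        `inputs` is (src, tar, label, src_sequence_length, tar_sequence_length).
        The source is encoded step by step with length masking, the decoder is
        run with teacher forcing on `tar`, and a length-masked cross-entropy
        loss against `label` is returned.
        """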
        src, tar, label, src_sequence_length, tar_sequence_length = inputs
        if src.shape[0] < self.batch_size:
            self.batch_size = src.shape[0]

        src_emb = self.src_embeder(self._transpose_batch_time(src))

        # NOTE: the model code for `enc_hidden` and `enc_cell` is modified so that
        # the dygraph code can be transformed to static graph successfully,
        # because nested lists cannot be transformed yet.
        enc_hidden_0 = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        enc_cell_0 = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        zero = fluid.layers.zeros(shape=[1], dtype="int64")
        enc_hidden = fluid.layers.create_array(dtype="float32")
        enc_cell = fluid.layers.create_array(dtype="float32")
        for i in range(self.num_layers):
            index = zero + i
            enc_hidden = fluid.layers.array_write(
                enc_hidden_0, index, array=enc_hidden
            )
            enc_cell = fluid.layers.array_write(
                enc_cell_0, index, array=enc_cell
            )

        max_seq_len = src_emb.shape[0]

        enc_len_mask = fluid.layers.sequence_mask(
            src_sequence_length, maxlen=max_seq_len, dtype="float32"
        )
        enc_len_mask = paddle.transpose(enc_len_mask, [1, 0])

        # TODO: a diff exists if while_loop is called in the static graph.
        # When a Variable created in the parent block takes part in the gradient
        # computation inside the while block, the gradient is wrong because each
        # step scope always returns the same value generated by the last step.
        # NOTE: replace max_seq_len (the Tensor src_emb.shape[0]) with
        # args.max_seq_len (a Python int) to avoid this bug temporarily.
        for k in range(args.max_seq_len):
            enc_step_input = src_emb[k]
            step_mask = enc_len_mask[k]
            new_enc_hidden, new_enc_cell = [], []
            for i in range(self.num_layers):
                enc_new_hidden, enc_new_cell = self.enc_units[i](
                    enc_step_input, enc_hidden[i], enc_cell[i]
                )
                if self.dropout is not None and self.dropout > 0.0:
                    enc_step_input = fluid.layers.dropout(
                        enc_new_hidden,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train',
                    )
                else:
                    enc_step_input = enc_new_hidden

                new_enc_hidden.append(
                    self._real_state(enc_hidden[i], enc_new_hidden, step_mask)
                )
                new_enc_cell.append(
                    self._real_state(enc_cell[i], enc_new_cell, step_mask)
                )

            enc_hidden, enc_cell = new_enc_hidden, new_enc_cell

        dec_hidden, dec_cell = enc_hidden, enc_cell
        tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
        max_seq_len = tar_emb.shape[0]
        dec_output = []
        for step_idx in range(max_seq_len):
            j = step_idx + 0
            step_input = tar_emb[j]
            new_dec_hidden, new_dec_cell = [], []
            for i in range(self.num_layers):
                new_hidden, new_cell = self.dec_units[i](
                    step_input, dec_hidden[i], dec_cell[i]
                )
                new_dec_hidden.append(new_hidden)
                new_dec_cell.append(new_cell)
                if self.dropout is not None and self.dropout > 0.0:
                    step_input = fluid.layers.dropout(
                        new_hidden,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train',
                    )
                else:
                    step_input = new_hidden
            dec_output.append(step_input)

        dec_output = paddle.stack(dec_output)
        dec_output = self.fc(self._transpose_batch_time(dec_output))
        loss = fluid.layers.softmax_with_cross_entropy(
            logits=dec_output, label=label, soft_label=False
        )
        loss = fluid.layers.squeeze(loss, axes=[2])
        max_tar_seq_len = fluid.layers.shape(tar)[1]
        tar_mask = fluid.layers.sequence_mask(
            tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32'
        )
        loss = loss * tar_mask
        loss = fluid.layers.reduce_mean(loss, dim=[0])
        loss = fluid.layers.reduce_sum(loss)

        return loss

    @declarative
    def beam_search(self, inputs):
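        """Decode with beam search and return the predicted token ids.

        The source is encoded as in `forward`; decoding then keeps the
        `beam_size` best-scoring hypotheses per batch element for at most
        `beam_max_step_num` steps and recovers the final ids with `gather_tree`.
        """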
        src, tar, label, src_sequence_length, tar_sequence_length = inputs
        if src.shape[0] < self.batch_size:
            self.batch_size = src.shape[0]

        src_emb = self.src_embeder(self._transpose_batch_time(src))
        enc_hidden_0 = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        enc_cell_0 = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        zero = fluid.layers.zeros(shape=[1], dtype="int64")
        enc_hidden = fluid.layers.create_array(dtype="float32")
        enc_cell = fluid.layers.create_array(dtype="float32")
        for j in range(self.num_layers):
            index = zero + j
            enc_hidden = fluid.layers.array_write(
                enc_hidden_0, index, array=enc_hidden
            )
            enc_cell = fluid.layers.array_write(
                enc_cell_0, index, array=enc_cell
            )

        max_seq_len = src_emb.shape[0]

        enc_len_mask = fluid.layers.sequence_mask(
            src_sequence_length, maxlen=max_seq_len, dtype="float32"
        )
        enc_len_mask = paddle.transpose(enc_len_mask, [1, 0])

        for k in range(args.max_seq_len):
            enc_step_input = src_emb[k]
            step_mask = enc_len_mask[k]

            new_enc_hidden, new_enc_cell = [], []

            for i in range(self.num_layers):
                enc_new_hidden, enc_new_cell = self.enc_units[i](
                    enc_step_input, enc_hidden[i], enc_cell[i]
                )
                if self.dropout is not None and self.dropout > 0.0:
                    enc_step_input = fluid.layers.dropout(
                        enc_new_hidden,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train',
                    )
                else:
                    enc_step_input = enc_new_hidden

                new_enc_hidden.append(
                    self._real_state(enc_hidden[i], enc_new_hidden, step_mask)
                )
                new_enc_cell.append(
                    self._real_state(enc_cell[i], enc_new_cell, step_mask)
                )

            enc_hidden, enc_cell = new_enc_hidden, new_enc_cell

        # beam search
        batch_beam_shape = (self.batch_size, self.beam_size)
        vocab_size_tensor = to_variable(
            np.full((1), self.tar_vocab_size)
        ).astype("int64")
        start_token_tensor = to_variable(
            np.full(batch_beam_shape, self.beam_start_token, dtype='int64')
        )
        end_token_tensor = to_variable(
            np.full(batch_beam_shape, self.beam_end_token, dtype='int64')
        )
        step_input = self.tar_embeder(start_token_tensor)
        beam_finished = to_variable(
            np.full(batch_beam_shape, 0, dtype='float32')
        )
        beam_state_log_probs = to_variable(
            np.array(
                [[0.0] + [-self.kinf] * (self.beam_size - 1)], dtype="float32"
            )
        )
        beam_state_log_probs = fluid.layers.expand(
            beam_state_log_probs, [self.batch_size, 1]
        )
        dec_hidden, dec_cell = enc_hidden, enc_cell
        dec_hidden = [self._expand_to_beam_size(ele) for ele in dec_hidden]
        dec_cell = [self._expand_to_beam_size(ele) for ele in dec_cell]

        batch_pos = fluid.layers.expand(
            fluid.layers.unsqueeze(
                to_variable(np.arange(0, self.batch_size, 1, dtype="int64")),
                [1],
            ),
            [1, self.beam_size],
        )
        predicted_ids = []
        parent_ids = []

        for step_idx in range(paddle.to_tensor(self.beam_max_step_num)):
            if fluid.layers.reduce_sum(1 - beam_finished).numpy()[0] == 0:
                break
            step_input = self._merge_batch_beams(step_input)
            new_dec_hidden, new_dec_cell = [], []
            state = 0
            dec_hidden = [
                self._merge_batch_beams(state) for state in dec_hidden
            ]
            dec_cell = [self._merge_batch_beams(state) for state in dec_cell]

            for i in range(self.num_layers):
                new_hidden, new_cell = self.dec_units[i](
                    step_input, dec_hidden[i], dec_cell[i]
                )
                new_dec_hidden.append(new_hidden)
                new_dec_cell.append(new_cell)
                if self.dropout is not None and self.dropout > 0.0:
                    step_input = fluid.layers.dropout(
                        new_hidden,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train',
                    )
                else:
                    step_input = new_hidden

            cell_outputs = self._split_batch_beams(step_input)
            cell_outputs = self.fc(cell_outputs)

            step_log_probs = fluid.layers.log(
                fluid.layers.softmax(cell_outputs)
            )
            noend_array = [-self.kinf] * self.tar_vocab_size
            noend_array[self.beam_end_token] = 0
            noend_mask_tensor = to_variable(
                np.array(noend_array, dtype='float32')
            )

            step_log_probs = fluid.layers.elementwise_mul(
                fluid.layers.expand(
                    fluid.layers.unsqueeze(beam_finished, [2]),
                    [1, 1, self.tar_vocab_size],
                ),
                noend_mask_tensor,
                axis=-1,
            ) - fluid.layers.elementwise_mul(
                step_log_probs, (beam_finished - 1), axis=0
            )
            log_probs = fluid.layers.elementwise_add(
                x=step_log_probs, y=beam_state_log_probs, axis=0
            )
            scores = paddle.reshape(
                log_probs, [-1, self.beam_size * self.tar_vocab_size]
            )
            topk_scores, topk_indices = fluid.layers.topk(
                input=scores, k=self.beam_size
            )

            beam_indices = paddle.floor_divide(topk_indices, vocab_size_tensor)
            token_indices = paddle.remainder(topk_indices, vocab_size_tensor)
            next_log_probs = self._gather(scores, topk_indices, batch_pos)

            x = 0
            new_dec_hidden = [
                self._split_batch_beams(state) for state in new_dec_hidden
            ]
            new_dec_cell = [
                self._split_batch_beams(state) for state in new_dec_cell
            ]
            new_dec_hidden = [
                self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden
            ]
            new_dec_cell = [
                self._gather(x, beam_indices, batch_pos) for x in new_dec_cell
            ]

            next_finished = self._gather(beam_finished, beam_indices, batch_pos)
            next_finished = fluid.layers.cast(next_finished, "bool")
            next_finished = fluid.layers.logical_or(
                next_finished,
                fluid.layers.equal(token_indices, end_token_tensor),
            )
            next_finished = fluid.layers.cast(next_finished, "float32")

            dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
            beam_finished = next_finished
            beam_state_log_probs = next_log_probs
            step_input = self.tar_embeder(token_indices)
            predicted_ids.append(token_indices)
            parent_ids.append(beam_indices)

        predicted_ids = paddle.stack(predicted_ids)
        parent_ids = paddle.stack(parent_ids)
        predicted_ids = fluid.layers.gather_tree(predicted_ids, parent_ids)
        predicted_ids = self._transpose_batch_time(predicted_ids)
        return predicted_ids


class AttentionModel(fluid.dygraph.Layer):
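    """A seq2seq model with dot-product (Luong-style) attention and input feeding.

    Compared with BaseModel, the decoder attends to the projected encoder
    outputs at every step and feeds the resulting attentional vector back into
    the next step; only the training `forward` pass is implemented here.
    """
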
    def __init__(
        self,
        hidden_size,
        src_vocab_size,
        tar_vocab_size,
        batch_size,
        num_layers=1,
        init_scale=0.1,
        dropout=None,
        beam_size=1,
        beam_start_token=1,
        beam_end_token=2,
        beam_max_step_num=2,
        mode='train',
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.src_vocab_size = src_vocab_size
        self.tar_vocab_size = tar_vocab_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.init_scale = init_scale
        self.dropout = dropout
        self.beam_size = beam_size
        self.beam_start_token = beam_start_token
        self.beam_end_token = beam_end_token
        self.beam_max_step_num = beam_max_step_num
        self.mode = mode
        self.kinf = 1e9

        param_attr = ParamAttr(initializer=uniform_initializer(self.init_scale))
        bias_attr = ParamAttr(initializer=zero_constant)
        forget_bias = 1.0

        self.src_embeder = Embedding(
            size=[self.src_vocab_size, self.hidden_size],
            param_attr=fluid.ParamAttr(
                name='source_embedding',
                initializer=uniform_initializer(init_scale),
            ),
        )

        self.tar_embeder = Embedding(
            size=[self.tar_vocab_size, self.hidden_size],
            is_sparse=False,
            param_attr=fluid.ParamAttr(
                name='target_embedding',
                initializer=uniform_initializer(init_scale),
            ),
        )

        self.enc_units = []
        for i in range(num_layers):
            self.enc_units.append(
                self.add_sublayer(
                    "enc_units_%d" % i,
                    BasicLSTMUnit(
                        hidden_size=self.hidden_size,
                        input_size=self.hidden_size,
                        param_attr=param_attr,
                        bias_attr=bias_attr,
                        forget_bias=forget_bias,
                    ),
                )
            )

        self.dec_units = []
        for i in range(num_layers):
            if i == 0:
                self.dec_units.append(
                    self.add_sublayer(
                        "dec_units_%d" % i,
                        BasicLSTMUnit(
                            hidden_size=self.hidden_size,
                            input_size=self.hidden_size * 2,
                            param_attr=ParamAttr(
                                name="dec_units_%d" % i,
                                initializer=uniform_initializer(
                                    self.init_scale
                                ),
                            ),
                            bias_attr=bias_attr,
                            forget_bias=forget_bias,
                        ),
                    )
                )
            else:
                self.dec_units.append(
                    self.add_sublayer(
                        "dec_units_%d" % i,
                        BasicLSTMUnit(
                            hidden_size=self.hidden_size,
                            input_size=self.hidden_size,
                            param_attr=ParamAttr(
                                name="dec_units_%d" % i,
                                initializer=uniform_initializer(
                                    self.init_scale
                                ),
                            ),
                            bias_attr=bias_attr,
                            forget_bias=forget_bias,
                        ),
                    )
                )

        self.attn_fc = fluid.dygraph.nn.Linear(
            self.hidden_size,
            self.hidden_size,
            param_attr=ParamAttr(
                name="self_attn_fc",
                initializer=uniform_initializer(self.init_scale),
            ),
            bias_attr=False,
        )

        self.concat_fc = fluid.dygraph.nn.Linear(
            2 * self.hidden_size,
            self.hidden_size,
            param_attr=ParamAttr(
                name="self_concat_fc",
                initializer=uniform_initializer(self.init_scale),
            ),
            bias_attr=False,
        )

        self.fc = fluid.dygraph.nn.Linear(
            self.hidden_size,
            self.tar_vocab_size,
            param_attr=ParamAttr(
                name="self_fc", initializer=uniform_initializer(self.init_scale)
            ),
            bias_attr=False,
        )

    def _transpose_batch_time(self, x):
        return paddle.transpose(x, [1, 0] + list(range(2, len(x.shape))))

    def _merge_batch_beams(self, x):
        return paddle.reshape(x, shape=(-1, x.shape[2]))

    def tile_beam_merge_with_batch(self, x):
        x = fluid.layers.unsqueeze(x, [1])  # [batch_size, 1, ...]
        expand_times = [1] * len(x.shape)
        expand_times[1] = self.beam_size
        x = fluid.layers.expand(x, expand_times)  # [batch_size, beam_size, ...]
        x = paddle.transpose(
            x, list(range(2, len(x.shape))) + [0, 1]
        )  # [..., batch_size, beam_size]
        # Use 0 in the target shape so reshape copies that dimension from the
        # input and avoids producing a wrong shape.
        x = paddle.reshape(
            x, shape=[0] * (len(x.shape) - 2) + [-1]
        )  # [..., batch_size * beam_size]
        x = paddle.transpose(
            x, [len(x.shape) - 1] + list(range(0, len(x.shape) - 1))
        )  # [batch_size * beam_size, ...]
        return x

    def _split_batch_beams(self, x):
        return paddle.reshape(x, shape=(-1, self.beam_size, x.shape[1]))

    def _expand_to_beam_size(self, x):
        x = fluid.layers.unsqueeze(x, [1])
        expand_times = [1] * len(x.shape)
        expand_times[1] = self.beam_size
        x = fluid.layers.expand(x, expand_times)
        return x

    def _real_state(self, state, new_state, step_mask):
        new_state = fluid.layers.elementwise_mul(
            new_state, step_mask, axis=0
        ) - fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0)
        return new_state

    def _gather(self, x, indices, batch_pos):
        topk_coordinates = paddle.stack([batch_pos, indices], axis=2)
        return fluid.layers.gather_nd(x, topk_coordinates)

    def attention(self, query, enc_output, mask=None):
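        """Dot-product attention of `query` over the projected encoder outputs.

        `mask` is expected to be 0 for valid positions and -1 for padded ones,
        so padded positions receive a large negative score before the softmax.
        """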
        query = fluid.layers.unsqueeze(query, [1])
        memory = self.attn_fc(enc_output)
        attn = fluid.layers.matmul(query, memory, transpose_y=True)

        if mask is not None:
            attn = paddle.transpose(attn, [1, 0, 2])
            attn = fluid.layers.elementwise_add(attn, mask * 1000000000, -1)
            attn = paddle.transpose(attn, [1, 0, 2])
        weight = fluid.layers.softmax(attn)
        weight_memory = fluid.layers.matmul(weight, memory)

        return weight_memory

    def _change_size_for_array(self, func, array):
        print(" ^" * 10, "_change_size_for_array")
        print("array : ", array)
        for i, state in enumerate(array):
            fluid.layers.array_write(func(state), i, array)

        return array

    @declarative
    def forward(self, inputs):
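        """Compute the training loss with attention and input feeding.

        `inputs` has the same layout as in BaseModel.forward. At each decoder
        step the decoder output attends to the encoder outputs, the attentional
        vector is fed into the next step as `input_feed`, and a length-masked
        cross-entropy loss is returned.
        """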
        src, tar, label, src_sequence_length, tar_sequence_length = inputs
        if src.shape[0] < self.batch_size:
            self.batch_size = src.shape[0]

        src_emb = self.src_embeder(self._transpose_batch_time(src))

        # NOTE: the model code for `enc_hidden` and `enc_cell` is modified so that
        # the dygraph code can be transformed to static graph successfully,
        # because nested lists cannot be transformed yet.
        enc_hidden_0 = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        enc_hidden_0.stop_gradient = True
        enc_cell_0 = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        enc_cell_0.stop_gradient = True
        zero = fluid.layers.zeros(shape=[1], dtype="int64")
        enc_hidden = fluid.layers.create_array(dtype="float32")
        enc_cell = fluid.layers.create_array(dtype="float32")
        for i in range(self.num_layers):
            index = zero + i
            enc_hidden = fluid.layers.array_write(
                enc_hidden_0, index, array=enc_hidden
            )
            enc_cell = fluid.layers.array_write(
                enc_cell_0, index, array=enc_cell
            )

        max_seq_len = src_emb.shape[0]

        enc_len_mask = fluid.layers.sequence_mask(
            src_sequence_length, maxlen=max_seq_len, dtype="float32"
        )
        enc_padding_mask = enc_len_mask - 1.0
        enc_len_mask = paddle.transpose(enc_len_mask, [1, 0])

        enc_outputs = []
        # TODO: a diff exists if while_loop is called in the static graph.
        # When a Variable created in the parent block takes part in the gradient
        # computation inside the while block, the gradient is wrong because each
        # step scope always returns the same value generated by the last step.
        for p in range(max_seq_len):
            k = 0 + p
            enc_step_input = src_emb[k]
            step_mask = enc_len_mask[k]
            new_enc_hidden, new_enc_cell = [], []
            for i in range(self.num_layers):
                enc_new_hidden, enc_new_cell = self.enc_units[i](
                    enc_step_input, enc_hidden[i], enc_cell[i]
                )
                if self.dropout is not None and self.dropout > 0.0:
                    enc_step_input = fluid.layers.dropout(
                        enc_new_hidden,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train',
                    )
                else:
                    enc_step_input = enc_new_hidden

                new_enc_hidden.append(
                    self._real_state(enc_hidden[i], enc_new_hidden, step_mask)
                )
                new_enc_cell.append(
                    self._real_state(enc_cell[i], enc_new_cell, step_mask)
                )
            enc_outputs.append(enc_step_input)
            enc_hidden, enc_cell = new_enc_hidden, new_enc_cell

        enc_outputs = paddle.stack(enc_outputs)
        enc_outputs = self._transpose_batch_time(enc_outputs)

        # train
        input_feed = to_variable(
            np.zeros((self.batch_size, self.hidden_size), dtype='float32')
        )
        # NOTE: set stop_gradient here, otherwise grad var is null
        input_feed.stop_gradient = True
        dec_hidden, dec_cell = enc_hidden, enc_cell
        tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
        max_seq_len = tar_emb.shape[0]
        dec_output = []

        for step_idx in range(max_seq_len):
            j = step_idx + 0
            step_input = tar_emb[j]
            step_input = fluid.layers.concat([step_input, input_feed], 1)
            new_dec_hidden, new_dec_cell = [], []
            for i in range(self.num_layers):
                new_hidden, new_cell = self.dec_units[i](
                    step_input, dec_hidden[i], dec_cell[i]
                )
                new_dec_hidden.append(new_hidden)
                new_dec_cell.append(new_cell)
                if self.dropout is not None and self.dropout > 0.0:
                    step_input = fluid.layers.dropout(
                        new_hidden,
                        dropout_prob=self.dropout,
                        dropout_implementation='upscale_in_train',
                    )
                else:
                    step_input = new_hidden
            dec_att = self.attention(step_input, enc_outputs, enc_padding_mask)
            dec_att = fluid.layers.squeeze(dec_att, [1])
            concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
            out = self.concat_fc(concat_att_out)
            input_feed = out
            dec_output.append(out)
            dec_hidden, dec_cell = new_dec_hidden, new_dec_cell

        dec_output = paddle.stack(dec_output)
        dec_output = self.fc(self._transpose_batch_time(dec_output))
        loss = fluid.layers.softmax_with_cross_entropy(
            logits=dec_output, label=label, soft_label=False
        )
        loss = fluid.layers.squeeze(loss, axes=[2])
        max_tar_seq_len = fluid.layers.shape(tar)[1]
        tar_mask = fluid.layers.sequence_mask(
            tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32'
        )
        loss = loss * tar_mask
        loss = fluid.layers.reduce_mean(loss, dim=[0])
        loss = fluid.layers.reduce_sum(loss)

        return loss
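

# A minimal usage sketch, assuming a Paddle version that still provides the
# fluid dygraph APIs used above. The vocabulary size, hidden size, batch size
# and the random integer data below are made up purely for illustration; only
# args.max_seq_len comes from the imported hyperparameters.
if __name__ == '__main__':
    with fluid.dygraph.guard():
        batch_size, seq_len = 4, args.max_seq_len
        vocab_size, hidden_size = 100, 8
        model = BaseModel(
            hidden_size=hidden_size,
            src_vocab_size=vocab_size,
            tar_vocab_size=vocab_size,
            batch_size=batch_size,
        )
        # Token ids are int64; label carries a trailing singleton dimension so
        # that softmax_with_cross_entropy sees shape [batch, seq_len, 1].
        src = to_variable(
            np.random.randint(0, vocab_size, (batch_size, seq_len)).astype('int64')
        )
        tar = to_variable(
            np.random.randint(0, vocab_size, (batch_size, seq_len)).astype('int64')
        )
        label = to_variable(
            np.random.randint(0, vocab_size, (batch_size, seq_len, 1)).astype('int64')
        )
        src_len = to_variable(np.full((batch_size,), seq_len, dtype='int64'))
        tar_len = to_variable(np.full((batch_size,), seq_len, dtype='int64'))
        loss = model([src, tar, label, src_len, tar_len])
        print("training loss:", loss.numpy())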