dist_transformer.py 56.3 KB
Newer Older
X
Xin Pan 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
import functools
G
gongweibao 已提交
16
import glob
17
import os
G
gongweibao 已提交
18 19
import random
import tarfile
20 21 22 23 24 25
import time
from functools import partial
from os.path import expanduser

import numpy as np
from test_dist_base import RUN_STEP, TestDistRunnerBase, runtime_main
X
Xin Pan 已提交
26

27
import paddle
X
Xin Pan 已提交
28
import paddle.fluid as fluid
G
gongweibao 已提交
29
import paddle.fluid.layers as layers
30
import paddle.nn.functional as F
G
gongweibao 已提交
31 32 33

const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001))
const_bias_attr = const_para_attr
X
Xin Pan 已提交
34 35 36 37 38 39

# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1


40
# from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list
41
class TrainTaskConfig:
G
gongweibao 已提交
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
    # only support GPU currently
    use_gpu = True
    # the epoch number to train.
    pass_num = 1
    # the number of sequences contained in a mini-batch.
    # deprecated, set batch_size in args.
    batch_size = 20
    # the hyper parameters for Adam optimizer.
    # This static learning_rate will be multiplied to the LearningRateScheduler
    # derived learning rate the to get the final learning rate.
    learning_rate = 1
    beta1 = 0.9
    beta2 = 0.98
    eps = 1e-9
    # the parameters for learning rate scheduling.
    warmup_steps = 4000
    # the weight used to mix up the ground-truth distribution and the fixed
    # uniform distribution in label smoothing when training.
    # Set this as zero if label smoothing is not wanted.
    label_smooth_eps = 0.1
    # the directory for saving trained models.
    model_dir = "trained_models"
    # the directory for saving checkpoints.
    ckpt_dir = "trained_ckpts"
    # the directory for loading checkpoint.
    # If provided, continue training from the checkpoint.
    ckpt_path = None
    # the parameter to initialize the learning rate scheduler.
    # It should be provided if use checkpoints, since the checkpoint doesn't
    # include the training step counter currently.
    start_step = 0
X
Xin Pan 已提交
73

G
gongweibao 已提交
74
    check_acc = True
X
Xin Pan 已提交
75

G
gongweibao 已提交
76
    data_path = expanduser("~") + (
77 78
        "/.cache/paddle/dataset/test_dist_transformer/"
    )
G
gongweibao 已提交
79 80 81
    src_vocab_fpath = data_path + "vocab.bpe.32000"
    trg_vocab_fpath = data_path + "vocab.bpe.32000"
    train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
W
Wu Yi 已提交
82
    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
G
gongweibao 已提交
83 84 85 86 87 88 89 90
    pool_size = 2000
    sort_type = None
    local = True
    shuffle = False
    shuffle_batch = False
    special_token = ['<s>', '<e>', '<unk>']
    token_delimiter = ' '
    use_token_batch = False
X
Xin Pan 已提交
91 92


93
class InferTaskConfig:
G
gongweibao 已提交
94 95 96 97 98 99 100 101 102 103 104 105 106 107
    use_gpu = True
    # the number of examples in one run for sequence generation.
    batch_size = 10
    # the parameters for beam search.
    beam_size = 5
    max_out_len = 256
    # the number of decoded sentences to output.
    n_best = 1
    # the flags indicating whether to output the special tokens.
    output_bos = False
    output_eos = False
    output_unk = True
    # the directory for loading the trained model.
    model_path = "trained_models/pass_1.infer.model"
X
Xin Pan 已提交
108 109


110
class ModelHyperParams:
G
gongweibao 已提交
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    # These following five vocabularies related configurations will be set
    # automatically according to the passed vocabulary path and special tokens.
    # size of source word dictionary.
    src_vocab_size = 10000
    # size of target word dictionay
    trg_vocab_size = 10000
    # index for <bos> token
    bos_idx = 0
    # index for <eos> token
    eos_idx = 1
    # index for <unk> token
    unk_idx = 2
    # max length of sequences deciding the size of position encoding table.
    # Start from 1 and count start and end tokens in.
    max_length = 256
X
Xin Pan 已提交
126 127 128 129 130
    # the dimension for word embeddings, which is also the last dimension of
    # the input and output of multi-head attention, position-wise feed-forward
    # networks, encoder and decoder.
    d_model = 512
    # size of the hidden layer in position-wise feed-forward networks.
G
gongweibao 已提交
131
    d_inner_hid = 2048
X
Xin Pan 已提交
132 133 134 135 136 137 138 139 140
    # the dimension that keys are projected to for dot-product attention.
    d_key = 64
    # the dimension that values are projected to for dot-product attention.
    d_value = 64
    # number of head used in multi-head attention.
    n_head = 8
    # number of sub-layers to be stacked in the encoder and decoder.
    n_layer = 6
    # dropout rate used by all dropout layers.
G
gongweibao 已提交
141 142 143 144 145 146
    dropout = 0.0  # no random
    # random seed used in dropout for CE.
    dropout_seed = None
    # the flag indicating whether to share embedding and softmax weights.
    # vocabularies in source and target should be same for weight sharing.
    weight_sharing = True
X
Xin Pan 已提交
147 148


G
gongweibao 已提交
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
def merge_cfg_from_list(cfg_list, g_cfgs):
    """
    Set the above global configurations using the cfg_list.
    """
    assert len(cfg_list) % 2 == 0
    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
        for g_cfg in g_cfgs:
            if hasattr(g_cfg, key):
                try:
                    value = eval(value)
                except Exception:  # for file path
                    pass
                setattr(g_cfg, key, value)
                break


# The placeholder for batch_size in compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = -1
# The placeholder for squence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
input_descs = {
    # The actual data shape of src_word is:
    # [batch_size * max_src_len_in_batch, 1]
177
    "src_word": [(batch_size, seq_len, 1), "int64", 2],
G
gongweibao 已提交
178 179
    # The actual data shape of src_pos is:
    # [batch_size * max_src_len_in_batch, 1]
180
    "src_pos": [(batch_size, seq_len, 1), "int64"],
G
gongweibao 已提交
181 182 183 184
    # This input is used to remove attention weights on paddings in the
    # encoder.
    # The actual data shape of src_slf_attn_bias is:
    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
185 186 187 188
    "src_slf_attn_bias": [
        (batch_size, ModelHyperParams.n_head, seq_len, seq_len),
        "float32",
    ],
G
gongweibao 已提交
189 190
    # The actual data shape of trg_word is:
    # [batch_size * max_trg_len_in_batch, 1]
191 192 193 194 195
    "trg_word": [
        (batch_size, seq_len, 1),
        "int64",
        2,
    ],  # lod_level is only used in fast decoder.
G
gongweibao 已提交
196 197
    # The actual data shape of trg_pos is:
    # [batch_size * max_trg_len_in_batch, 1]
198
    "trg_pos": [(batch_size, seq_len, 1), "int64"],
G
gongweibao 已提交
199 200 201 202
    # This input is used to remove attention weights on paddings and
    # subsequent words in the decoder.
    # The actual data shape of trg_slf_attn_bias is:
    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
203 204 205 206
    "trg_slf_attn_bias": [
        (batch_size, ModelHyperParams.n_head, seq_len, seq_len),
        "float32",
    ],
G
gongweibao 已提交
207 208 209 210
    # This input is used to remove attention weights on paddings of the source
    # input in the encoder-decoder attention.
    # The actual data shape of trg_src_attn_bias is:
    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
211 212 213 214
    "trg_src_attn_bias": [
        (batch_size, ModelHyperParams.n_head, seq_len, seq_len),
        "float32",
    ],
G
gongweibao 已提交
215 216 217 218 219 220
    # This input is used in independent decoder program for inference.
    # The actual data shape of enc_output is:
    # [batch_size, max_src_len_in_batch, d_model]
    "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
    # The actual data shape of label_word is:
    # [batch_size * max_trg_len_in_batch, 1]
221
    "lbl_word": [(batch_size * seq_len, 1), "int64"],
T
tianshuo78520a 已提交
222
    # This input is used to mask out the loss of padding tokens.
G
gongweibao 已提交
223 224
    # The actual data shape of label_weight is:
    # [batch_size * max_trg_len_in_batch, 1]
225
    "lbl_weight": [(batch_size * seq_len, 1), "float32"],
G
gongweibao 已提交
226
    # These inputs are used to change the shape tensor in beam-search decoder.
227 228
    "trg_slf_attn_pre_softmax_shape_delta": [(2,), "int32"],
    "trg_slf_attn_post_softmax_shape_delta": [(4,), "int32"],
229
    "init_score": [(batch_size, 1), "float32"],
G
gongweibao 已提交
230 231 232 233 234
}

# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
    "src_word_emb_table",
235 236
    "trg_word_emb_table",
)
G
gongweibao 已提交
237 238 239
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
    "src_pos_enc_table",
240 241
    "trg_pos_enc_table",
)
G
gongweibao 已提交
242 243 244 245
# separated inputs for different usages.
encoder_data_input_fields = (
    "src_word",
    "src_pos",
246 247
    "src_slf_attn_bias",
)
G
gongweibao 已提交
248 249 250 251 252
decoder_data_input_fields = (
    "trg_word",
    "trg_pos",
    "trg_slf_attn_bias",
    "trg_src_attn_bias",
253 254
    "enc_output",
)
G
gongweibao 已提交
255 256
label_data_input_fields = (
    "lbl_word",
257 258
    "lbl_weight",
)
G
gongweibao 已提交
259 260 261 262 263
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
    "trg_word",
    "init_score",
264 265
    "trg_src_attn_bias",
)
G
gongweibao 已提交
266 267 268 269 270 271

# fast_decoder_util_input_fields = (
#     "trg_slf_attn_pre_softmax_shape_delta",
#     "trg_slf_attn_post_softmax_shape_delta", )


272
# from optim import LearningRateScheduler
273
class LearningRateScheduler:
G
gongweibao 已提交
274 275 276
    """
    Wrapper for learning rate scheduling as described in the Transformer paper.
    LearningRateScheduler adapts the learning rate externally and the adapted
T
tianshuo78520a 已提交
277
    learning rate will be fed into the main_program as input data.
G
gongweibao 已提交
278 279
    """

280 281 282 283 284 285 286 287
    def __init__(
        self,
        d_model,
        warmup_steps,
        learning_rate=0.001,
        current_steps=0,
        name="learning_rate",
    ):
G
gongweibao 已提交
288 289 290 291
        self.current_steps = current_steps
        self.warmup_steps = warmup_steps
        self.d_model = d_model
        self.static_lr = learning_rate
292
        self.learning_rate = paddle.static.create_global_var(
G
gongweibao 已提交
293 294 295 296
            name=name,
            shape=[1],
            value=float(learning_rate),
            dtype="float32",
297 298
            persistable=True,
        )
G
gongweibao 已提交
299 300 301

    def update_learning_rate(self):
        self.current_steps += 1
302 303 304 305 306 307 308 309 310 311
        lr_value = (
            np.power(self.d_model, -0.5)
            * np.min(
                [
                    np.power(self.current_steps, -0.5),
                    np.power(self.warmup_steps, -1.5) * self.current_steps,
                ]
            )
            * self.static_lr
        )
G
gongweibao 已提交
312 313 314
        return np.array([lr_value], dtype="float32")


315 316 317 318 319 320 321 322 323 324 325
# from transformer_train import train_loop
def pad_batch_data(
    insts,
    pad_idx,
    n_head,
    is_target=False,
    is_label=False,
    return_attn_bias=True,
    return_max_len=True,
    return_num_token=False,
):
X
Xin Pan 已提交
326 327
    """
    Pad the instances to the max sequence length in batch, and generate the
G
gongweibao 已提交
328
    corresponding position data and attention bias.
X
Xin Pan 已提交
329
    """
G
gongweibao 已提交
330 331
    return_list = []
    max_len = max(len(inst) for inst in insts)
332 333 334 335 336
    num_token = (
        functools.reduce(lambda x, y: x + y, [len(inst) for inst in insts])
        if return_num_token
        else 0
    )
G
gongweibao 已提交
337 338 339
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array(
340 341
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]
    )
G
gongweibao 已提交
342 343
    return_list += [inst_data.astype("int64").reshape([-1, 1])]
    if is_label:  # label weight
344 345 346 347 348 349
        inst_weight = np.array(
            [
                [1.0] * len(inst) + [0.0] * (max_len - len(inst))
                for inst in insts
            ]
        )
G
gongweibao 已提交
350 351
        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
    else:  # position data
352 353 354 355 356 357
        inst_pos = np.array(
            [
                list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
                for inst in insts
            ]
        )
G
gongweibao 已提交
358 359 360 361 362 363
        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
    if return_attn_bias:
        if is_target:
            # This is used to avoid attention on paddings and subsequent
            # words.
            slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len))
364 365 366 367 368 369
            slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
                [-1, 1, max_len, max_len]
            )
            slf_attn_bias_data = np.tile(
                slf_attn_bias_data, [1, n_head, 1, 1]
            ) * [-1e9]
G
gongweibao 已提交
370 371
        else:
            # This is used to avoid attention on paddings.
372 373 374 375 376 377
            slf_attn_bias_data = np.array(
                [
                    [0] * len(inst) + [-1e9] * (max_len - len(inst))
                    for inst in insts
                ]
            )
G
gongweibao 已提交
378 379
            slf_attn_bias_data = np.tile(
                slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
380 381
                [1, n_head, max_len, 1],
            )
G
gongweibao 已提交
382 383 384 385 386 387 388 389
        return_list += [slf_attn_bias_data.astype("float32")]
    if return_max_len:
        return_list += [max_len]
    if return_num_token:
        return_list += [num_token]
    return return_list if len(return_list) > 1 else return_list[0]


390 391 392
def prepare_batch_input(
    insts, data_input_names, src_pad_idx, trg_pad_idx, n_head, d_model
):
G
gongweibao 已提交
393 394 395 396
    """
    Put all padded data needed by training into a dict.
    """
    src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
397 398
        [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False
    )
G
gongweibao 已提交
399 400 401
    src_word = src_word.reshape(-1, src_max_len, 1)
    src_pos = src_pos.reshape(-1, src_max_len, 1)
    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data(
402 403
        [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True
    )
G
gongweibao 已提交
404 405
    trg_word = trg_word.reshape(-1, trg_max_len, 1)
    trg_pos = trg_pos.reshape(-1, trg_max_len, 1)
X
Xin Pan 已提交
406

407 408 409
    trg_src_attn_bias = np.tile(
        src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]
    ).astype("float32")
X
Xin Pan 已提交
410

G
gongweibao 已提交
411 412 413 414 415 416 417 418
    lbl_word, lbl_weight, num_token = pad_batch_data(
        [inst[2] for inst in insts],
        trg_pad_idx,
        n_head,
        is_target=False,
        is_label=True,
        return_attn_bias=False,
        return_max_len=False,
419 420
        return_num_token=True,
    )
G
gongweibao 已提交
421 422

    data_input_dict = dict(
M
minqiyang 已提交
423
        list(
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
            zip(
                data_input_names,
                [
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_pos,
                    trg_slf_attn_bias,
                    trg_src_attn_bias,
                    lbl_word,
                    lbl_weight,
                ],
            )
        )
    )
G
gongweibao 已提交
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
    return data_input_dict, np.asarray([num_token], dtype="float32")


def read_multiple(reader, count, clip_last=True):
    """
    Stack data from reader for multi-devices.
    """

    def __impl__():
        res = []
        for item in reader():
            res.append(item)
            if len(res) == count:
                yield res
                res = []
        if len(res) == count:
            yield res
        elif not clip_last:
            data = []
            for item in res:
                data += item
            if len(data) > count:
                inst_num_per_part = len(data) // count
                yield [
464
                    data[inst_num_per_part * i : inst_num_per_part * (i + 1)]
G
gongweibao 已提交
465 466 467 468 469 470 471 472 473 474 475 476 477 478
                    for i in range(count)
                ]

    return __impl__


def split_data(data, num_part):
    """
    Split data for each device.
    """
    if len(data) == num_part:
        return data
    data = data[0]
    inst_num_per_part = len(data) // num_part
X
Xin Pan 已提交
479
    return [
480
        data[inst_num_per_part * i : inst_num_per_part * (i + 1)]
G
gongweibao 已提交
481
        for i in range(num_part)
X
Xin Pan 已提交
482 483 484
    ]


485 486 487 488 489 490 491 492 493
def test_context(
    test_program,
    avg_cost,
    train_exe,
    dev_count,
    data_input_names,
    sum_cost,
    token_num,
):
G
gongweibao 已提交
494 495 496 497 498 499
    val_data = DataReader(
        src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
        trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
        fpattern=TrainTaskConfig.val_file_pattern,
        token_delimiter=TrainTaskConfig.token_delimiter,
        use_token_batch=TrainTaskConfig.use_token_batch,
500 501
        batch_size=TrainTaskConfig.batch_size
        * (1 if TrainTaskConfig.use_token_batch else dev_count),
G
gongweibao 已提交
502 503 504 505 506 507 508 509 510
        pool_size=TrainTaskConfig.pool_size,
        sort_type=TrainTaskConfig.sort_type,
        start_mark=TrainTaskConfig.special_token[0],
        end_mark=TrainTaskConfig.special_token[1],
        unk_mark=TrainTaskConfig.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False,
        shuffle=False,
511 512
        shuffle_batch=False,
    )
G
gongweibao 已提交
513 514 515 516 517 518

    build_strategy = fluid.BuildStrategy()

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1

519 520 521 522 523 524 525
    test_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        main_program=test_program,
        share_vars_from=train_exe,
        build_strategy=build_strategy,
        exec_strategy=strategy,
    )
G
gongweibao 已提交
526 527 528 529 530 531

    def test(exe=test_exe):
        test_total_cost = 0
        test_total_token = 0
        test_data = read_multiple(
            reader=val_data.batch_generator,
532 533
            count=dev_count if TrainTaskConfig.use_token_batch else 1,
        )
G
gongweibao 已提交
534 535 536
        for batch_id, data in enumerate(test_data()):
            feed_list = []
            for place_id, data_buffer in enumerate(
537 538
                split_data(data, num_part=dev_count)
            ):
G
gongweibao 已提交
539
                data_input_dict, _ = prepare_batch_input(
540 541 542 543 544 545 546
                    data_buffer,
                    data_input_names,
                    ModelHyperParams.eos_idx,
                    ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_model,
                )
G
gongweibao 已提交
547 548
                feed_list.append(data_input_dict)

549 550 551
            outs = exe.run(
                feed=feed_list, fetch_list=[sum_cost.name, token_num.name]
            )
G
gongweibao 已提交
552 553 554 555 556 557 558 559 560 561
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    return test


562 563 564 565 566 567 568 569 570 571 572
def train_loop(
    exe,
    train_progm,
    dev_count,
    sum_cost,
    avg_cost,
    lr_scheduler,
    token_num,
    predict,
    test_program,
):
G
gongweibao 已提交
573 574 575 576 577 578 579 580 581 582 583 584
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    train_data = DataReader(
        src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
        trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
        fpattern=TrainTaskConfig.train_file_pattern,
        token_delimiter=TrainTaskConfig.token_delimiter,
        use_token_batch=TrainTaskConfig.use_token_batch,
585 586
        batch_size=TrainTaskConfig.batch_size
        * (1 if TrainTaskConfig.use_token_batch else dev_count),
G
gongweibao 已提交
587 588 589 590 591 592 593 594 595
        pool_size=TrainTaskConfig.pool_size,
        sort_type=TrainTaskConfig.sort_type,
        shuffle=TrainTaskConfig.shuffle,
        shuffle_batch=TrainTaskConfig.shuffle_batch,
        start_mark=TrainTaskConfig.special_token[0],
        end_mark=TrainTaskConfig.special_token[1],
        unk_mark=TrainTaskConfig.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
596 597
        clip_last_batch=False,
    )
G
gongweibao 已提交
598 599
    train_data = read_multiple(
        reader=train_data.batch_generator,
600 601
        count=dev_count if TrainTaskConfig.use_token_batch else 1,
    )
G
gongweibao 已提交
602 603 604 605 606

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
607 608 609
    build_strategy.gradient_scale_strategy = (
        fluid.BuildStrategy.GradientScaleStrategy.Customized
    )
G
gongweibao 已提交
610 611 612 613

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1

614 615 616 617 618 619 620
    train_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        loss_name=sum_cost.name,
        main_program=train_progm,
        build_strategy=build_strategy,
        exec_strategy=strategy,
    )
G
gongweibao 已提交
621

622 623 624 625 626
    data_input_names = (
        encoder_data_input_fields
        + decoder_data_input_fields[:-1]
        + label_data_input_fields
    )
G
gongweibao 已提交
627 628

    if TrainTaskConfig.val_file_pattern is not None:
629 630 631 632 633 634 635 636 637
        test = test_context(
            test_program,
            avg_cost,
            train_exe,
            dev_count,
            data_input_names,
            sum_cost,
            token_num,
        )
G
gongweibao 已提交
638 639

    # the best cross-entropy value with label smoothing
640 641 642 643 644 645 646 647 648 649
    loss_normalizer = -(
        (1.0 - TrainTaskConfig.label_smooth_eps)
        * np.log((1.0 - TrainTaskConfig.label_smooth_eps))
        + TrainTaskConfig.label_smooth_eps
        * np.log(
            TrainTaskConfig.label_smooth_eps
            / (ModelHyperParams.trg_vocab_size - 1)
            + 1e-20
        )
    )
G
gongweibao 已提交
650
    init = False
651
    for pass_id in range(TrainTaskConfig.pass_num):
G
gongweibao 已提交
652 653
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
G
gongweibao 已提交
654
            if batch_id >= RUN_STEP:
G
gongweibao 已提交
655 656 657 658 659 660 661 662 663
                break

            feed_list = []
            total_num_token = 0

            if TrainTaskConfig.local:
                lr_rate = lr_scheduler.update_learning_rate()

            for place_id, data_buffer in enumerate(
664 665
                split_data(data, num_part=dev_count)
            ):
G
gongweibao 已提交
666
                data_input_dict, num_token = prepare_batch_input(
667 668 669 670 671 672 673
                    data_buffer,
                    data_input_names,
                    ModelHyperParams.eos_idx,
                    ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_model,
                )
G
gongweibao 已提交
674
                total_num_token += num_token
M
minqiyang 已提交
675
                feed_kv_pairs = list(data_input_dict.items())
G
gongweibao 已提交
676
                if TrainTaskConfig.local:
677
                    feed_kv_pairs += list(
678 679
                        {lr_scheduler.learning_rate.name: lr_rate}.items()
                    )
G
gongweibao 已提交
680 681 682 683 684 685
                feed_list.append(dict(feed_kv_pairs))

                if not init:
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
686 687
                            ModelHyperParams.d_model,
                        )
G
gongweibao 已提交
688 689 690 691
                        feed_list[place_id][pos_enc_param_name] = pos_enc

            if not TrainTaskConfig.check_acc:
                for feed_dict in feed_list:
692
                    feed_dict[sum_cost.name + "@GRAD"] = 1.0 / total_num_token
G
gongweibao 已提交
693 694 695 696
            else:
                b = 100 * TrainTaskConfig.batch_size
                a = np.asarray([b], dtype="float32")
                for feed_dict in feed_list:
697
                    feed_dict[sum_cost.name + "@GRAD"] = 1.0 / a
G
gongweibao 已提交
698

699 700 701
            outs = train_exe.run(
                fetch_list=[sum_cost.name, token_num.name], feed=feed_list
            )
G
gongweibao 已提交
702 703 704 705 706 707 708 709 710

            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num

            init = True

            # Validate and save the model for inference.
G
gongweibao 已提交
711 712 713 714
            if TrainTaskConfig.val_file_pattern is not None:
                val_avg_cost, val_ppl = test()
                print("[%f]" % val_avg_cost)
            else:
715
                assert False
G
gongweibao 已提交
716 717


718
# import transformer_reader as reader
719
class SortType:
G
gongweibao 已提交
720 721 722 723 724
    GLOBAL = 'global'
    POOL = 'pool'
    NONE = "none"


725
class Converter:
G
gongweibao 已提交
726 727 728 729 730 731 732 733
    def __init__(self, vocab, beg, end, unk, delimiter):
        self._vocab = vocab
        self._beg = beg
        self._end = end
        self._unk = unk
        self._delimiter = delimiter

    def __call__(self, sentence):
734 735 736 737 738 739 740 741
        return (
            [self._beg]
            + [
                self._vocab.get(w, self._unk)
                for w in sentence.split(self._delimiter)
            ]
            + [self._end]
        )
G
gongweibao 已提交
742 743


744
class ComposedConverter:
G
gongweibao 已提交
745 746 747 748 749 750 751 752 753 754
    def __init__(self, converters):
        self._converters = converters

    def __call__(self, parallel_sentence):
        return [
            self._converters[i](parallel_sentence[i])
            for i in range(len(self._converters))
        ]


755
class SentenceBatchCreator:
G
gongweibao 已提交
756 757 758 759 760 761 762 763 764 765 766 767
    def __init__(self, batch_size):
        self.batch = []
        self._batch_size = batch_size

    def append(self, info):
        self.batch.append(info)
        if len(self.batch) == self._batch_size:
            tmp = self.batch
            self.batch = []
            return tmp


768
class TokenBatchCreator:
G
gongweibao 已提交
769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
    def __init__(self, batch_size):
        self.batch = []
        self.max_len = -1
        self._batch_size = batch_size

    def append(self, info):
        cur_len = info.max_len
        max_len = max(self.max_len, cur_len)
        if max_len * (len(self.batch) + 1) > self._batch_size:
            result = self.batch
            self.batch = [info]
            self.max_len = cur_len
            return result
        else:
            self.max_len = max_len
            self.batch.append(info)


787
class SampleInfo:
G
gongweibao 已提交
788 789 790 791 792 793
    def __init__(self, i, max_len, min_len):
        self.i = i
        self.min_len = min_len
        self.max_len = max_len


794
class MinMaxFilter:
G
gongweibao 已提交
795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
    def __init__(self, max_len, min_len, underlying_creator):
        self._min_len = min_len
        self._max_len = max_len
        self._creator = underlying_creator

    def append(self, info):
        if info.max_len > self._max_len or info.min_len < self._min_len:
            return
        else:
            return self._creator.append(info)

    @property
    def batch(self):
        return self._creator.batch


811
class DataReader:
G
gongweibao 已提交
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883
    """
    The data reader loads all data from files and produces batches of data
    in the way corresponding to settings.

    An example of returning a generator producing data batches whose data
    is shuffled in each pass and sorted in each pool:

    ```
    train_data = DataReader(
        src_vocab_fpath='data/src_vocab_file',
        trg_vocab_fpath='data/trg_vocab_file',
        fpattern='data/part-*',
        use_token_batch=True,
        batch_size=2000,
        pool_size=10000,
        sort_type=SortType.POOL,
        shuffle=True,
        shuffle_batch=True,
        start_mark='<s>',
        end_mark='<e>',
        unk_mark='<unk>',
        clip_last_batch=False).batch_generator
    ```

    :param src_vocab_fpath: The path of vocabulary file of source language.
    :type src_vocab_fpath: basestring
    :param trg_vocab_fpath: The path of vocabulary file of target language.
    :type trg_vocab_fpath: basestring
    :param fpattern: The pattern to match data files.
    :type fpattern: basestring
    :param batch_size: The number of sequences contained in a mini-batch.
        or the maximum number of tokens (include paddings) contained in a
        mini-batch.
    :type batch_size: int
    :param pool_size: The size of pool buffer.
    :type pool_size: int
    :param sort_type: The grain to sort by length: 'global' for all
        instances; 'pool' for instances in pool; 'none' for no sort.
    :type sort_type: basestring
    :param clip_last_batch: Whether to clip the last uncompleted batch.
    :type clip_last_batch: bool
    :param tar_fname: The data file in tar if fpattern matches a tar file.
    :type tar_fname: basestring
    :param min_length: The minimum length used to filt sequences.
    :type min_length: int
    :param max_length: The maximum length used to filt sequences.
    :type max_length: int
    :param shuffle: Whether to shuffle all instances.
    :type shuffle: bool
    :param shuffle_batch: Whether to shuffle the generated batches.
    :type shuffle_batch: bool
    :param use_token_batch: Whether to produce batch data according to
        token number.
    :type use_token_batch: bool
    :param field_delimiter: The delimiter used to split source and target in
        each line of data file.
    :type field_delimiter: basestring
    :param token_delimiter: The delimiter used to split tokens in source or
        target sentences.
    :type token_delimiter: basestring
    :param start_mark: The token representing for the beginning of
        sentences in dictionary.
    :type start_mark: basestring
    :param end_mark: The token representing for the end of sentences
        in dictionary.
    :type end_mark: basestring
    :param unk_mark: The token representing for unknown word in dictionary.
    :type unk_mark: basestring
    :param seed: The seed for random.
    :type seed: int
    """

884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
    def __init__(
        self,
        src_vocab_fpath,
        trg_vocab_fpath,
        fpattern,
        batch_size,
        pool_size,
        sort_type=SortType.GLOBAL,
        clip_last_batch=True,
        tar_fname=None,
        min_length=0,
        max_length=100,
        shuffle=True,
        shuffle_batch=False,
        use_token_batch=False,
        field_delimiter="\t",
        token_delimiter=" ",
        start_mark="<s>",
        end_mark="<e>",
        unk_mark="<unk>",
        seed=0,
    ):
G
gongweibao 已提交
906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
        self._src_vocab = self.load_dict(src_vocab_fpath)
        self._only_src = True
        if trg_vocab_fpath is not None:
            self._trg_vocab = self.load_dict(trg_vocab_fpath)
            self._only_src = False
        self._pool_size = pool_size
        self._batch_size = batch_size
        self._use_token_batch = use_token_batch
        self._sort_type = sort_type
        self._clip_last_batch = clip_last_batch
        self._shuffle = shuffle
        self._shuffle_batch = shuffle_batch
        self._min_length = min_length
        self._max_length = max_length
        self._field_delimiter = field_delimiter
        self._token_delimiter = token_delimiter
922 923 924
        self.load_src_trg_ids(
            end_mark, fpattern, start_mark, tar_fname, unk_mark
        )
G
gongweibao 已提交
925 926
        self._random = random.Random(x=seed)

927 928 929
    def load_src_trg_ids(
        self, end_mark, fpattern, start_mark, tar_fname, unk_mark
    ):
G
gongweibao 已提交
930
        converters = [
931 932 933 934 935 936 937
            Converter(
                vocab=self._src_vocab,
                beg=self._src_vocab[start_mark],
                end=self._src_vocab[end_mark],
                unk=self._src_vocab[unk_mark],
                delimiter=self._token_delimiter,
            )
G
gongweibao 已提交
938 939 940
        ]
        if not self._only_src:
            converters.append(
941 942 943 944 945 946 947 948
                Converter(
                    vocab=self._trg_vocab,
                    beg=self._trg_vocab[start_mark],
                    end=self._trg_vocab[end_mark],
                    unk=self._trg_vocab[unk_mark],
                    delimiter=self._token_delimiter,
                )
            )
G
gongweibao 已提交
949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973

        converters = ComposedConverter(converters)

        self._src_seq_ids = []
        self._trg_seq_ids = None if self._only_src else []
        self._sample_infos = []

        for i, line in enumerate(self._load_lines(fpattern, tar_fname)):
            src_trg_ids = converters(line)
            self._src_seq_ids.append(src_trg_ids[0])
            lens = [len(src_trg_ids[0])]
            if not self._only_src:
                self._trg_seq_ids.append(src_trg_ids[1])
                lens.append(len(src_trg_ids[1]))
            self._sample_infos.append(SampleInfo(i, max(lens), min(lens)))

    def _load_lines(self, fpattern, tar_fname):
        fpaths = glob.glob(fpattern)

        if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
            if tar_fname is None:
                raise Exception("If tar file provided, please set tar_fname.")

            f = tarfile.open(fpaths[0], "r")
            for line in f.extractfile(tar_fname):
974
                line = line.decode()
G
gongweibao 已提交
975
                fields = line.strip("\n").split(self._field_delimiter)
976 977 978
                if (not self._only_src and len(fields) == 2) or (
                    self._only_src and len(fields) == 1
                ):
G
gongweibao 已提交
979 980 981 982 983 984
                    yield fields
        else:
            for fpath in fpaths:
                if not os.path.isfile(fpath):
                    raise IOError("Invalid file: %s" % fpath)

M
minqiyang 已提交
985
                with open(fpath, "rb") as f:
G
gongweibao 已提交
986
                    for line in f:
987
                        line = line.decode()
G
gongweibao 已提交
988
                        fields = line.strip("\n").split(self._field_delimiter)
989 990 991
                        if (not self._only_src and len(fields) == 2) or (
                            self._only_src and len(fields) == 1
                        ):
G
gongweibao 已提交
992 993 994 995 996
                            yield fields

    @staticmethod
    def load_dict(dict_path, reverse=False):
        word_dict = {}
M
minqiyang 已提交
997
        with open(dict_path, "rb") as fdict:
G
gongweibao 已提交
998
            for idx, line in enumerate(fdict):
999
                line = line.decode()
G
gongweibao 已提交
1000 1001 1002 1003 1004 1005 1006 1007 1008
                if reverse:
                    word_dict[idx] = line.strip("\n")
                else:
                    word_dict[line.strip("\n")] = idx
        return word_dict

    def batch_generator(self):
        # global sort or global shuffle
        if self._sort_type == SortType.GLOBAL:
1009 1010 1011
            infos = sorted(
                self._sample_infos, key=lambda x: x.max_len, reverse=True
            )
G
gongweibao 已提交
1012 1013 1014 1015 1016 1017 1018 1019 1020
        else:
            if self._shuffle:
                infos = self._sample_infos
                self._random.shuffle(infos)
            else:
                infos = self._sample_infos

            if self._sort_type == SortType.POOL:
                for i in range(0, len(infos), self._pool_size):
1021 1022 1023
                    infos[i : i + self._pool_size] = sorted(
                        infos[i : i + self._pool_size], key=lambda x: x.max_len
                    )
G
gongweibao 已提交
1024 1025 1026

        # concat batch
        batches = []
1027 1028 1029 1030 1031 1032 1033 1034
        batch_creator = (
            TokenBatchCreator(self._batch_size)
            if self._use_token_batch
            else SentenceBatchCreator(self._batch_size)
        )
        batch_creator = MinMaxFilter(
            self._max_length, self._min_length, batch_creator
        )
G
gongweibao 已提交
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052

        for info in infos:
            batch = batch_creator.append(info)
            if batch is not None:
                batches.append(batch)

        if not self._clip_last_batch and len(batch_creator.batch) != 0:
            batches.append(batch_creator.batch)

        if self._shuffle_batch:
            self._random.shuffle(batches)

        for batch in batches:
            batch_ids = [info.i for info in batch]

            if self._only_src:
                yield [[self._src_seq_ids[idx]] for idx in batch_ids]
            else:
1053 1054 1055 1056 1057 1058 1059 1060
                yield [
                    (
                        self._src_seq_ids[idx],
                        self._trg_seq_ids[idx][:-1],
                        self._trg_seq_ids[idx][1:],
                    )
                    for idx in batch_ids
                ]
G
gongweibao 已提交
1061 1062


1063
# from transformer_model import transformer
G
gongweibao 已提交
1064 1065 1066 1067
def position_encoding_init(n_position, d_pos_vec):
    """
    Generate the initial values for the sinusoid position encoding table.
    """
1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078
    position_enc = np.array(
        [
            [
                pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
                for j in range(d_pos_vec)
            ]
            if pos != 0
            else np.zeros(d_pos_vec)
            for pos in range(n_position)
        ]
    )
G
gongweibao 已提交
1079 1080 1081 1082 1083
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc.astype("float32")


1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
def multi_head_attention(
    queries,
    keys,
    values,
    attn_bias,
    d_key,
    d_value,
    d_model,
    n_head=1,
    dropout_rate=0.0,
    cache=None,
):
G
gongweibao 已提交
1096 1097 1098 1099 1100 1101 1102
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activiation to mask certain selected positions so that
    they will not considered in attention weights.
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
1103 1104
            "Inputs: queries, keys and values should all be 3-D tensors."
        )
G
gongweibao 已提交
1105 1106 1107 1108 1109

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130
        q = layers.fc(
            input=queries,
            size=d_key * n_head,
            num_flatten_dims=2,
            param_attr=const_para_attr,
            bias_attr=const_bias_attr,
        )
        k = layers.fc(
            input=keys,
            size=d_key * n_head,
            num_flatten_dims=2,
            param_attr=const_para_attr,
            bias_attr=const_bias_attr,
        )
        v = layers.fc(
            input=values,
            size=d_value * n_head,
            num_flatten_dims=2,
            param_attr=const_para_attr,
            bias_attr=const_bias_attr,
        )
G
gongweibao 已提交
1131 1132 1133 1134
        return q, k, v

    def __split_heads(x, n_head):
        """
T
tianshuo78520a 已提交
1135
        Reshape the last dimension of input tensor x so that it becomes two
G
gongweibao 已提交
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        if n_head == 1:
            return x

        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
1146
        reshaped = paddle.reshape(
1147 1148
            x=x, shape=[0, 0, n_head, hidden_size // n_head]
        )
G
gongweibao 已提交
1149

T
tianshuo78520a 已提交
1150
        # permute the dimensions into:
G
gongweibao 已提交
1151
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
1152
        return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3])
G
gongweibao 已提交
1153 1154 1155

    def __combine_heads(x):
        """
T
tianshuo78520a 已提交
1156
        Transpose and then reshape the last two dimensions of input tensor x
G
gongweibao 已提交
1157 1158
        so that it becomes one dimension, which is reverse to __split_heads.
        """
1159 1160
        if len(x.shape) == 3:
            return x
G
gongweibao 已提交
1161 1162 1163
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

1164
        trans_x = paddle.transpose(x, perm=[0, 2, 1, 3])
G
gongweibao 已提交
1165 1166
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
1167
        return paddle.reshape(
G
gongweibao 已提交
1168
            x=trans_x,
1169 1170
            shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])),
        )
G
gongweibao 已提交
1171 1172 1173 1174 1175

    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
2
201716010711 已提交
1176
        scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
K
kangguangli 已提交
1177
        product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
G
gongweibao 已提交
1178 1179
        if attn_bias:
            product += attn_bias
1180
        weights = paddle.nn.functional.softmax(product)
G
gongweibao 已提交
1181
        if dropout_rate:
C
ccrrong 已提交
1182
            weights = paddle.nn.functional.dropout(
1183
                weights,
C
ccrrong 已提交
1184
                p=dropout_rate,
1185
            )
K
kangguangli 已提交
1186
        out = paddle.matmul(weights, v)
G
gongweibao 已提交
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        k = cache["k"] = layers.concat([cache["k"], k], axis=1)
        v = cache["v"] = layers.concat([cache["v"], v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

1199 1200 1201
    ctx_multiheads = scaled_dot_product_attention(
        q, k, v, attn_bias, d_model, dropout_rate
    )
G
gongweibao 已提交
1202 1203 1204 1205

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
1206 1207 1208 1209 1210 1211 1212
    proj_out = layers.fc(
        input=out,
        size=d_model,
        num_flatten_dims=2,
        param_attr=const_para_attr,
        bias_attr=const_bias_attr,
    )
G
gongweibao 已提交
1213 1214 1215 1216 1217 1218 1219 1220 1221
    return proj_out


def positionwise_feed_forward(x, d_inner_hid, d_hid):
    """
    Position-wise Feed-Forward Networks.
    This module consists of two linear transformations with a ReLU activation
    in between, which is applied to each position separately and identically.
    """
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236
    hidden = layers.fc(
        input=x,
        size=d_inner_hid,
        num_flatten_dims=2,
        act="relu",
        param_attr=const_para_attr,
        bias_attr=const_bias_attr,
    )
    out = layers.fc(
        input=hidden,
        size=d_hid,
        num_flatten_dims=2,
        param_attr=const_para_attr,
        bias_attr=const_bias_attr,
    )
G
gongweibao 已提交
1237 1238 1239
    return out


1240
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.0):
G
gongweibao 已提交
1241 1242 1243 1244 1245 1246 1247 1248 1249 1250
    """
    Add residual connection, layer normalization and droput to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
1251
            out = paddle.static.nn.layer_norm(
1252 1253 1254 1255 1256
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.0),
                bias_attr=fluid.initializer.Constant(0.0),
            )
G
gongweibao 已提交
1257 1258
        elif cmd == "d":  # add dropout
            if dropout_rate:
C
ccrrong 已提交
1259
                out = paddle.nn.functional.dropout(
1260
                    out,
C
ccrrong 已提交
1261
                    p=dropout_rate,
1262
                )
G
gongweibao 已提交
1263 1264 1265 1266 1267 1268 1269
    return out


pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer


1270 1271 1272 1273 1274 1275 1276 1277 1278 1279
def prepare_encoder(
    src_word,
    src_pos,
    src_vocab_size,
    src_emb_dim,
    src_max_len,
    dropout_rate=0.0,
    word_emb_param_name=None,
    pos_enc_param_name=None,
):
G
gongweibao 已提交
1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    if TrainTaskConfig.check_acc:
        src_word_emb = layers.embedding(
            src_word,
            size=[src_vocab_size, src_emb_dim],
            param_attr=fluid.ParamAttr(
                name=word_emb_param_name,
1291 1292 1293
                initializer=fluid.initializer.ConstantInitializer(0.001),
            ),
        )
G
gongweibao 已提交
1294 1295 1296 1297
    else:
        src_word_emb = layers.embedding(
            src_word,
            size=[src_vocab_size, src_emb_dim],
1298 1299 1300 1301 1302
            param_attr=fluid.ParamAttr(
                name=word_emb_param_name,
                initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5),
            ),
        )
G
gongweibao 已提交
1303

2
201716010711 已提交
1304
    src_word_emb = paddle.scale(x=src_word_emb, scale=src_emb_dim**0.5)
G
gongweibao 已提交
1305 1306 1307 1308 1309 1310
    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name,
            trainable=False,
1311 1312 1313
            initializer=fluid.initializer.ConstantInitializer(0.001),
        ),
    )
M
minqiyang 已提交
1314
    src_pos_enc.stop_gradient = True
G
gongweibao 已提交
1315
    enc_input = src_word_emb + src_pos_enc
1316
    return (
C
ccrrong 已提交
1317
        paddle.nn.functional.dropout(
1318
            enc_input,
C
ccrrong 已提交
1319
            p=dropout_rate,
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343
        )
        if dropout_rate
        else enc_input
    )


prepare_encoder = partial(
    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0]
)
prepare_decoder = partial(
    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1]
)


def encoder_layer(
    enc_input,
    attn_bias,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate=0.0,
):
G
gongweibao 已提交
1344 1345 1346 1347 1348 1349
    """The encoder layers that can be stacked to form a deep encoder.
    This module consits of a multi-head (self) attention followed by
    position-wise feed-forward networks and both the two components companied
    with the post_process_layer to add residual connection, layer normalization
    and droput.
    """
1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363
    attn_output = multi_head_attention(
        enc_input,
        enc_input,
        enc_input,
        attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        dropout_rate,
    )
    attn_output = post_process_layer(
        enc_input, attn_output, "dan", dropout_rate
    )
G
gongweibao 已提交
1364 1365 1366 1367
    ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
    return post_process_layer(attn_output, ffd_output, "dan", dropout_rate)


1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385
def encoder(
    enc_input,
    attn_bias,
    n_layer,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate=0.0,
):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.
    """
    for i in range(n_layer):
        enc_output = encoder_layer(
            enc_input,
G
gongweibao 已提交
1386 1387 1388 1389 1390 1391
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
1392 1393
            dropout_rate,
        )
G
gongweibao 已提交
1394 1395 1396 1397
        enc_input = enc_output
    return enc_output


1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411
def decoder_layer(
    dec_input,
    enc_output,
    slf_attn_bias,
    dec_enc_attn_bias,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate=0.0,
    cache=None,
):
    """The layer to be stacked in decoder part.
G
gongweibao 已提交
1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424
    The structure of this module is similar to that in the encoder part except
    a multi-head attention is added to implement encoder-decoder attention.
    """
    slf_attn_output = multi_head_attention(
        dec_input,
        dec_input,
        dec_input,
        slf_attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        dropout_rate,
1425 1426
        cache,
    )
G
gongweibao 已提交
1427 1428 1429 1430
    slf_attn_output = post_process_layer(
        dec_input,
        slf_attn_output,
        "dan",  # residual connection + dropout + layer normalization
1431 1432
        dropout_rate,
    )
G
gongweibao 已提交
1433 1434 1435 1436 1437 1438 1439 1440 1441
    enc_attn_output = multi_head_attention(
        slf_attn_output,
        enc_output,
        enc_output,
        dec_enc_attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
1442 1443
        dropout_rate,
    )
G
gongweibao 已提交
1444 1445 1446 1447
    enc_attn_output = post_process_layer(
        slf_attn_output,
        enc_attn_output,
        "dan",  # residual connection + dropout + layer normalization
1448 1449
        dropout_rate,
    )
G
gongweibao 已提交
1450 1451 1452
    ffd_output = positionwise_feed_forward(
        enc_attn_output,
        d_inner_hid,
1453 1454
        d_model,
    )
G
gongweibao 已提交
1455 1456 1457 1458
    dec_output = post_process_layer(
        enc_attn_output,
        ffd_output,
        "dan",  # residual connection + dropout + layer normalization
1459 1460
        dropout_rate,
    )
G
gongweibao 已提交
1461
    return dec_output
X
Xin Pan 已提交
1462 1463


1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477
def decoder(
    dec_input,
    enc_output,
    dec_slf_attn_bias,
    dec_enc_attn_bias,
    n_layer,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate=0.0,
    caches=None,
):
G
gongweibao 已提交
1478 1479 1480 1481 1482 1483 1484 1485
    """
    The decoder is composed of a stack of identical decoder_layer layers.
    """
    for i in range(n_layer):
        cache = None
        if caches is not None:
            cache = caches[i]

1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498
        dec_output = decoder_layer(
            dec_input,
            enc_output,
            dec_slf_attn_bias,
            dec_enc_attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate,
            cache=cache,
        )
G
gongweibao 已提交
1499 1500 1501 1502 1503 1504 1505 1506 1507 1508
        dec_input = dec_output
    return dec_output


def make_all_inputs(input_fields):
    """
    Define the input data layers for the transformer model.
    """
    inputs = []
    for input_field in input_fields:
1509 1510 1511 1512 1513 1514 1515 1516 1517
        input_var = layers.data(
            name=input_field,
            shape=input_descs[input_field][0],
            dtype=input_descs[input_field][1],
            lod_level=input_descs[input_field][2]
            if len(input_descs[input_field]) == 3
            else 0,
            append_batch_size=False,
        )
G
gongweibao 已提交
1518 1519 1520 1521 1522
        inputs.append(input_var)
    return inputs


def transformer(
1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535
    src_vocab_size,
    trg_vocab_size,
    max_length,
    n_layer,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate,
    weight_sharing,
    label_smooth_eps,
):
G
gongweibao 已提交
1536
    if weight_sharing:
1537 1538 1539
        assert (
            src_vocab_size == src_vocab_size
        ), "Vocabularies in source and target should be same for weight sharing."
G
gongweibao 已提交
1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552
    enc_inputs = make_all_inputs(encoder_data_input_fields)

    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
1553 1554
        enc_inputs,
    )
G
gongweibao 已提交
1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569

    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        dec_inputs,
1570 1571
        enc_output,
    )
G
gongweibao 已提交
1572 1573 1574 1575 1576

    # Padding index do not contribute to the total loss. The weights is used to
    # cancel padding index in calculating the loss.
    label, weights = make_all_inputs(label_data_input_fields)
    if label_smooth_eps:
1577
        label = F.label_smooth(
1578
            label=paddle.nn.functional.one_hot(label, trg_vocab_size),
1579 1580
            epsilon=label_smooth_eps,
        )
G
gongweibao 已提交
1581

1582
    cost = paddle.nn.functional.softmax_with_cross_entropy(
1583
        logits=paddle.reshape(predict, shape=[-1, trg_vocab_size]),
G
gongweibao 已提交
1584
        label=label,
1585 1586
        soft_label=True if label_smooth_eps else False,
    )
G
gongweibao 已提交
1587
    weighted_cost = cost * weights
1588 1589
    sum_cost = paddle.sum(weighted_cost)
    token_num = paddle.sum(weights)
G
gongweibao 已提交
1590 1591 1592 1593 1594
    avg_cost = sum_cost / token_num
    avg_cost.stop_gradient = True
    return sum_cost, avg_cost, predict, token_num


1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607
def wrap_encoder(
    src_vocab_size,
    max_length,
    n_layer,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate,
    weight_sharing,
    enc_inputs=None,
):
G
gongweibao 已提交
1608 1609 1610 1611 1612
    """
    The wrapper assembles together all needed layers for the encoder.
    """
    if enc_inputs is None:
        # This is used to implement independent encoder program in inference.
1613 1614 1615
        src_word, src_pos, src_slf_attn_bias = make_all_inputs(
            encoder_data_input_fields
        )
G
gongweibao 已提交
1616
    else:
1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637
        src_word, src_pos, src_slf_attn_bias = enc_inputs
    enc_input = prepare_encoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        max_length,
        dropout_rate,
        word_emb_param_name=word_emb_param_names[0],
    )
    enc_output = encoder(
        enc_input,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
    )
G
gongweibao 已提交
1638 1639 1640
    return enc_output


1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655
def wrap_decoder(
    trg_vocab_size,
    max_length,
    n_layer,
    n_head,
    d_key,
    d_value,
    d_model,
    d_inner_hid,
    dropout_rate,
    weight_sharing,
    dec_inputs=None,
    enc_output=None,
    caches=None,
):
G
gongweibao 已提交
1656 1657 1658 1659 1660
    """
    The wrapper assembles together all needed layers for the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
1661 1662 1663 1664 1665 1666 1667
        (
            trg_word,
            trg_pos,
            trg_slf_attn_bias,
            trg_src_attn_bias,
            enc_output,
        ) = make_all_inputs(decoder_data_input_fields)
G
gongweibao 已提交
1668 1669 1670
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs

1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695
    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        max_length,
        dropout_rate,
        word_emb_param_name=word_emb_param_names[0]
        if weight_sharing
        else word_emb_param_names[1],
    )
    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        caches=caches,
    )
G
gongweibao 已提交
1696 1697
    # Return logits for training and probs for inference.
    if weight_sharing:
K
kangguangli 已提交
1698
        predict = paddle.matmul(
1699 1700 1701 1702
            x=dec_output,
            y=fluid.framework._get_var(word_emb_param_names[0]),
            transpose_y=True,
        )
G
gongweibao 已提交
1703
    else:
1704 1705 1706 1707 1708 1709 1710
        predict = layers.fc(
            input=dec_output,
            size=trg_vocab_size,
            num_flatten_dims=2,
            param_attr=const_para_attr,
            bias_attr=const_bias_attr,
        )
G
gongweibao 已提交
1711
    if dec_inputs is None:
1712
        predict = paddle.nn.functional.softmax(predict)
G
gongweibao 已提交
1713 1714 1715 1716 1717
    return predict


def get_model(is_dist, is_async):
    sum_cost, avg_cost, predict, token_num = transformer(
1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736
        ModelHyperParams.src_vocab_size,
        ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer,
        ModelHyperParams.n_head,
        ModelHyperParams.d_key,
        ModelHyperParams.d_value,
        ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing,
        TrainTaskConfig.label_smooth_eps,
    )

    local_lr_scheduler = LearningRateScheduler(
        ModelHyperParams.d_model,
        TrainTaskConfig.warmup_steps,
        TrainTaskConfig.learning_rate,
    )
1737 1738
    # Context to do validation.
    test_program = fluid.default_main_program().clone(for_test=True)
G
gongweibao 已提交
1739 1740 1741 1742 1743 1744

    if not is_dist:
        optimizer = fluid.optimizer.Adam(
            learning_rate=local_lr_scheduler.learning_rate,
            beta1=TrainTaskConfig.beta1,
            beta2=TrainTaskConfig.beta2,
1745 1746
            epsilon=TrainTaskConfig.eps,
        )
G
gongweibao 已提交
1747 1748 1749 1750 1751
        optimizer.minimize(sum_cost)
    elif is_async:
        optimizer = fluid.optimizer.SGD(0.003)
        optimizer.minimize(sum_cost)
    else:
1752 1753 1754 1755 1756 1757 1758 1759 1760 1761
        lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
            ModelHyperParams.d_model, TrainTaskConfig.warmup_steps
        )

        optimizer = fluid.optimizer.Adam(
            learning_rate=lr_decay,
            beta1=TrainTaskConfig.beta1,
            beta2=TrainTaskConfig.beta2,
            epsilon=TrainTaskConfig.eps,
        )
G
gongweibao 已提交
1762 1763
        optimizer.minimize(sum_cost)

1764 1765 1766 1767 1768 1769 1770 1771
    return (
        sum_cost,
        avg_cost,
        predict,
        token_num,
        local_lr_scheduler,
        test_program,
    )
X
Xin Pan 已提交
1772 1773


G
gongweibao 已提交
1774 1775 1776 1777
def update_args():
    src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath)
    trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath)
    dict_args = [
1778
        "src_vocab_size",
1779 1780 1781 1782 1783 1784 1785 1786 1787
        str(len(src_dict)),
        "trg_vocab_size",
        str(len(trg_dict)),
        "bos_idx",
        str(src_dict[TrainTaskConfig.special_token[0]]),
        "eos_idx",
        str(src_dict[TrainTaskConfig.special_token[1]]),
        "unk_idx",
        str(src_dict[TrainTaskConfig.special_token[2]]),
G
gongweibao 已提交
1788 1789 1790 1791 1792
    ]
    merge_cfg_from_list(dict_args, [TrainTaskConfig, ModelHyperParams])


class DistTransformer2x2(TestDistRunnerBase):
W
Wu Yi 已提交
1793 1794
    def run_pserver(self, args):
        get_model(True, not args.sync_mode)
1795 1796 1797 1798 1799 1800 1801
        t = self.get_transpiler(
            args.trainer_id,
            fluid.default_main_program(),
            args.endpoints,
            args.trainers,
            args.sync_mode,
        )
W
Wu Yi 已提交
1802
        pserver_prog = t.get_pserver_program(args.current_endpoint)
1803 1804 1805
        startup_prog = t.get_startup_program(
            args.current_endpoint, pserver_prog
        )
X
Xin Pan 已提交
1806 1807 1808 1809 1810 1811

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)

1812 1813
    def run_trainer(self, args):
        TrainTaskConfig.use_gpu = args.use_cuda
1814 1815 1816 1817 1818 1819 1820 1821
        (
            sum_cost,
            avg_cost,
            predict,
            token_num,
            local_lr_scheduler,
            test_program,
        ) = get_model(args.is_dist, not args.sync_mode)
G
gongweibao 已提交
1822

W
Wu Yi 已提交
1823
        if args.is_dist:
1824 1825 1826 1827 1828 1829 1830
            t = self.get_transpiler(
                args.trainer_id,
                fluid.default_main_program(),
                args.endpoints,
                args.trainers,
                args.sync_mode,
            )
X
Xin Pan 已提交
1831
            trainer_prog = t.get_trainer_program()
G
gongweibao 已提交
1832
            TrainTaskConfig.batch_size = 10
1833 1834 1835 1836 1837 1838
            TrainTaskConfig.train_file_pattern = (
                TrainTaskConfig.data_path
                + "train.tok.clean.bpe.32000.en-de.train_{}".format(
                    args.trainer_id
                )
            )
X
Xin Pan 已提交
1839
        else:
G
gongweibao 已提交
1840
            TrainTaskConfig.batch_size = 20
X
Xin Pan 已提交
1841 1842
            trainer_prog = fluid.default_main_program()

1843 1844 1845 1846 1847
        if args.use_cuda:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

X
Xin Pan 已提交
1848
        startup_exe = fluid.Executor(place)
G
gongweibao 已提交
1849

W
Wu Yi 已提交
1850
        TrainTaskConfig.local = not args.is_dist
G
gongweibao 已提交
1851

1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862
        train_loop(
            startup_exe,
            trainer_prog,
            1,
            sum_cost,
            avg_cost,
            local_lr_scheduler,
            token_num,
            predict,
            test_program,
        )
X
Xin Pan 已提交
1863 1864 1865


if __name__ == "__main__":
G
gongweibao 已提交
1866 1867
    update_args()
    runtime_main(DistTransformer2x2)