infer.py 17.3 KB
Newer Older
1
import argparse
2 3
import numpy as np

4
import paddle
5 6 7 8 9
import paddle.fluid as fluid

import model
from model import wrap_encoder as encoder
from model import wrap_decoder as decoder
10
from config import *
11
from train import pad_batch_data
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
import reader


def parse_args():
    """
    Parse the command line arguments for inference and merge the settings
    derived from the vocabularies into the global configs.

    Besides the declared options, all trailing arguments are collected into
    ``args.opts``. They are passed, together with the vocabulary sizes and the
    indices of the special tokens, to ``merge_cfg_from_list`` to update
    ``InferTaskConfig`` and ``ModelHyperParams``.

    Returns:
        The ``argparse.Namespace`` holding the parsed arguments.
    """
    # The text is a description of the script, not the program name, so it is
    # passed as `description` rather than as the first positional (`prog`).
    parser = argparse.ArgumentParser(description="Inference for Transformer.")
    parser.add_argument(
        "--src_vocab_fpath",
        type=str,
        required=True,
        help="The path of vocabulary file of source language.")
    parser.add_argument(
        "--trg_vocab_fpath",
        type=str,
        required=True,
        help="The path of vocabulary file of target language.")
    parser.add_argument(
        "--test_file_pattern",
        type=str,
        required=True,
        help="The pattern to match test data files.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=50,
        help="The number of examples in one run for sequence generation.")
    parser.add_argument(
        "--pool_size",
        type=int,
        default=10000,
        help="The buffer size to pool data.")
    parser.add_argument(
        "--special_token",
        type=str,
        default=["<s>", "<e>", "<unk>"],
        nargs=3,
        help="The <bos>, <eos> and <unk> tokens in the dictionary.")
    parser.add_argument(
        'opts',
        help='See config.py for all options',
        default=None,
        nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # Append args related to dict. The indices of the special tokens are
    # looked up in the source dictionary.
    src_dict = reader.DataReader.load_dict(args.src_vocab_fpath)
    trg_dict = reader.DataReader.load_dict(args.trg_vocab_fpath)
    bos_token, eos_token, unk_token = args.special_token
    dict_args = [
        "src_vocab_size", str(len(src_dict)),
        "trg_vocab_size", str(len(trg_dict)),
        "bos_idx", str(src_dict[bos_token]),
        "eos_idx", str(src_dict[eos_token]),
        "unk_idx", str(src_dict[unk_token]),
    ]
    merge_cfg_from_list(args.opts + dict_args,
                        [InferTaskConfig, ModelHyperParams])
    return args
66 67


68 69 70 71 72 73 74 75 76 77 78 79 80
def translate_batch(exe,
                    src_words,
                    encoder,
                    enc_in_names,
                    enc_out_names,
                    decoder,
                    dec_in_names,
                    dec_out_names,
                    beam_size,
                    max_length,
                    n_best,
                    batch_size,
                    n_head,
                    d_model,
                    src_pad_idx,
                    trg_pad_idx,
                    bos_idx,
                    eos_idx,
                    unk_idx,
                    output_unk=True):
    """
    Run the encoder program once and run the decoder program multiple times to
    implement beam search externally.

    Args:
        exe: The executor; both programs are run through ``exe.run``.
        src_words: The source sequences of the batch, passed to
            ``pad_batch_data``.
        encoder: The encoder program.
        enc_in_names: Names of the encoder inputs, zipped in order with the
            prepared encoder input data to build the feed dict.
        enc_out_names: Fetch list of the encoder. The first fetched result is
            used as the encoder output.
        decoder: The decoder program.
        dec_in_names: Names of the decoder inputs, zipped in order with the
            prepared decoder input data to build the feed dict.
        dec_out_names: Fetch list of the decoder. The first fetched result is
            used as the prediction.
        beam_size: The number of candidates kept for each instance per step.
        max_length: The maximum number of decoding steps.
        n_best: The number of sequences returned for each instance. Candidates
            are picked from the ``beam_size`` ones kept per step, so it is
            expected to be no larger than ``beam_size``.
        batch_size: The number of instances in the batch.
        n_head: The number of attention heads, used to build attention bias.
        d_model: The model size, used in the embedding reshape inputs.
        src_pad_idx: The index used to pad the source sequences.
        trg_pad_idx: Not used by this function.
        bos_idx: The index of <bos>, used as the first target word.
        eos_idx: The index of <eos>. An instance is finished once the best
            candidate of the current step is <eos>.
        unk_idx: The index of <unk>.
        output_unk: If False, the score of <unk> is set to -1e9 to exclude it.

    Returns:
        A tuple ``(seqs, scores)``. ``seqs[i]`` is the list of ``n_best``
        decoded sequences (each starting with ``bos_idx``) of instance ``i``,
        and ``scores[i]`` is the list of the corresponding accumulated log
        scores.
    """
    # Prepare data for encoder and run the encoder.
    enc_in_data = pad_batch_data(
        src_words,
        src_pad_idx,
        n_head,
        is_target=False,
        is_label=False,
        return_attn_bias=True,
        return_max_len=False)
    # Append the data shape input to reshape the output of embedding layer.
    enc_in_data = enc_in_data + [
        np.array(
            [-1, enc_in_data[2].shape[-1], d_model], dtype="int32")
    ]
    # Append the shape inputs to reshape before and after softmax in encoder
    # self attention.
    enc_in_data = enc_in_data + [
        np.array(
            [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array(
                enc_in_data[2].shape, dtype="int32")
    ]
    enc_output = exe.run(encoder,
                         feed=dict(zip(enc_in_names, enc_in_data)),
                         fetch_list=enc_out_names)[0]

    # Beam Search.
    # To store the beam info. For each instance, prev_branchs and next_ids
    # hold one entry per decoding step: the back-pointers into the previous
    # step and the word ids selected at this step.
    scores = np.zeros((batch_size, beam_size), dtype="float32")
    prev_branchs = [[] for _ in range(batch_size)]
    next_ids = [[] for _ in range(batch_size)]
    # Use beam_inst_map to map beam idx to the instance idx in batch, since the
    # size of the fed batch is changing.
    beam_inst_map = {beam_idx: beam_idx for beam_idx in range(batch_size)}
    # Use active_beams to record the alive instances. A real list is used so
    # that it can index numpy arrays on both Python 2 and Python 3.
    active_beams = list(range(batch_size))

    def beam_backtrace(prev_branchs, next_ids, n_best=beam_size):
        """
        Decode and select n_best sequences for one instance by backtrace.
        """
        seqs = []
        for i in range(n_best):
            k = i
            seq = []
            # Walk from the last step to the first one following the
            # back-pointers.
            for j in range(len(prev_branchs) - 1, -1, -1):
                seq.append(next_ids[j][k])
                k = prev_branchs[j][k]
            seq = seq[::-1]
            # Add the <bos>, since next_ids don't include the <bos>.
            seq = [bos_idx] + seq
            seqs.append(seq)
        return seqs

    def init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output):
        """
        Initialize the input data for decoder.
        """
        trg_words = np.array(
            [[bos_idx]] * batch_size * beam_size, dtype="int64")
        trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64")
        src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[
            -1], enc_in_data[2], 1
        # This is used to remove attention on subsequent words.
        trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len,
                                     trg_max_len))
        trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
            [-1, 1, trg_max_len, trg_max_len])
        trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
                             [-1e9]).astype("float32")
        # This is used to remove attention on the paddings of source sequences.
        trg_src_attn_bias = np.tile(
            src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis],
            [1, beam_size, 1, trg_max_len, 1]).reshape([
                -1, src_slf_attn_bias.shape[1], trg_max_len,
                src_slf_attn_bias.shape[-1]
            ])
        # Append the shape input to reshape the output of embedding layer.
        trg_data_shape = np.array(
            [batch_size * beam_size, trg_max_len, d_model], dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # decoder self attention.
        trg_slf_attn_pre_softmax_shape = np.array(
            [-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
        trg_slf_attn_post_softmax_shape = np.array(
            trg_slf_attn_bias.shape, dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # encoder-decoder attention.
        trg_src_attn_pre_softmax_shape = np.array(
            [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
        trg_src_attn_post_softmax_shape = np.array(
            trg_src_attn_bias.shape, dtype="int32")
        # Repeat the encoder output beam_size times for each instance.
        enc_output = np.tile(
            enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape(
                [-1, enc_output.shape[-2], enc_output.shape[-1]])
        return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
            trg_src_attn_post_softmax_shape, enc_output

    def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map):
        """
        Update the input data of decoder mainly by slicing from the previous
        input data and dropping the finished instance beams.
        """
        trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
            trg_src_attn_post_softmax_shape, enc_output = dec_in_data
        trg_cur_len = trg_slf_attn_bias.shape[-1] + 1
        # Rebuild the target words of the alive instances from the beam info;
        # prev_branchs is taken from the enclosing scope.
        trg_words = np.array(
            [
                beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx])
                for beam_idx in active_beams
            ],
            dtype="int64")
        trg_words = trg_words.reshape([-1, 1])
        trg_pos = np.array(
            [list(range(1, trg_cur_len + 1))] * len(active_beams) * beam_size,
            dtype="int64").reshape([-1, 1])
        # Map the alive beam indices to the row indices of the previous
        # decoder input data.
        active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams]
        active_beams_indice = (
            (np.array(active_beams) * beam_size)[:, np.newaxis] +
            np.array(range(beam_size))[np.newaxis, :]).flatten()
        # This is used to remove attention on subsequent words.
        trg_slf_attn_bias = np.ones((len(active_beams) * beam_size, trg_cur_len,
                                     trg_cur_len))
        trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
            [-1, 1, trg_cur_len, trg_cur_len])
        trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
                             [-1e9]).astype("float32")
        # This is used to remove attention on the paddings of source sequences.
        trg_src_attn_bias = np.tile(trg_src_attn_bias[
            active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
                                    [1, 1, trg_cur_len, 1])
        # Append the shape input to reshape the output of embedding layer.
        trg_data_shape = np.array(
            [len(active_beams) * beam_size, trg_cur_len, d_model],
            dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # decoder self attention.
        trg_slf_attn_pre_softmax_shape = np.array(
            [-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
        trg_slf_attn_post_softmax_shape = np.array(
            trg_slf_attn_bias.shape, dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # encoder-decoder attention.
        trg_src_attn_pre_softmax_shape = np.array(
            [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
        trg_src_attn_post_softmax_shape = np.array(
            trg_src_attn_bias.shape, dtype="int32")
        enc_output = enc_output[active_beams_indice, :, :]
        return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
            trg_src_attn_post_softmax_shape, enc_output

    dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
                                   enc_output)
    for i in range(max_length):
        predict_all = exe.run(decoder,
                              feed=dict(zip(dec_in_names, dec_in_data)),
                              fetch_list=dec_out_names)[0]
        # Only the prediction of the last position is needed at each step.
        predict_all = np.log(
            predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
            [:, -1, :])
        # Accumulate the log scores of the alive beams.
        predict_all = (predict_all + scores[active_beams].reshape(
            [len(beam_inst_map) * beam_size, -1])).reshape(
                [len(beam_inst_map), beam_size, -1])
        if not output_unk:  # To exclude the <unk> token.
            predict_all[:, :, unk_idx] = -1e9
        active_beams = []
        for beam_idx in range(batch_size):
            # `in` instead of dict.has_key, which is unavailable on Python 3.
            if beam_idx not in beam_inst_map:
                continue
            inst_idx = beam_inst_map[beam_idx]
            # At the first step all beams of an instance are identical, so
            # only the first one is considered.
            predict = (predict_all[inst_idx, :, :]
                       if i != 0 else predict_all[inst_idx, 0, :]).flatten()
            top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
            top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice])[::
                                                                            -1]]
            top_scores = predict[top_scores_ids]
            scores[beam_idx] = top_scores
            # Floor division keeps the back-pointers integral on Python 3,
            # where `/` would produce floats unusable as indices.
            prev_branchs[beam_idx].append(top_scores_ids //
                                          predict_all.shape[-1])
            next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
            # The instance stays alive unless its best candidate is <eos>.
            if next_ids[beam_idx][-1][0] != eos_idx:
                active_beams.append(beam_idx)
        if len(active_beams) == 0:
            break
        dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams,
                                         beam_inst_map)
        beam_inst_map = {
            beam_idx: inst_idx
            for inst_idx, beam_idx in enumerate(active_beams)
        }

    # Decode beams and select n_best sequences for each instance by backtrace.
    seqs = [
        beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)
        for beam_idx in range(batch_size)
    ]

    return seqs, scores[:, :n_best].tolist()


300
def infer(args):
    """
    Build the encoder and decoder programs, load their parameters from
    ``InferTaskConfig.model_path`` and print the translations of the test data.

    For every instance, each of the decoded sequences returned by
    ``translate_batch`` is post-processed and printed on its own line as
    space-separated target words.

    Args:
        args: The parsed command line arguments, see ``parse_args``.
    """
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    encoder_program = fluid.Program()
    with fluid.program_guard(main_program=encoder_program):
        enc_output = encoder(
            ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout)

    decoder_program = fluid.Program()
    with fluid.program_guard(main_program=decoder_program):
        predict = decoder(
            ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout)

    def collect_params(program):
        """
        Return the parameters among the op inputs of the program's block 0.
        """
        block = program.block(0)
        var_names = []
        for op in block.ops:
            var_names += op.input_arg_names
        # Build a real list: on Python 3 `filter`/`map` would only give lazy,
        # single-use iterators.
        return [
            block.var(var_name) for var_name in var_names
            if isinstance(block.var(var_name), fluid.framework.Parameter)
        ]

    # Load model parameters of encoder and decoder separately from the saved
    # transformer model.
    encoder_params = collect_params(encoder_program)
    decoder_params = collect_params(decoder_program)
    fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params)
    fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)

    # This is used here to set dropout to the test mode.
    encoder_program = fluid.io.get_inference_program(
        target_vars=[enc_output], main_program=encoder_program)
    decoder_program = fluid.io.get_inference_program(
        target_vars=[predict], main_program=decoder_program)

    test_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.test_file_pattern,
        batch_size=args.batch_size,
        use_token_batch=False,
        pool_size=args.pool_size,
        sort_type=reader.SortType.NONE,
        shuffle=False,
        shuffle_batch=False,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        max_length=ModelHyperParams.max_length,
        clip_last_batch=False)

    trg_idx2word = test_data.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)

    def post_process_seq(seq,
                         bos_idx=ModelHyperParams.bos_idx,
                         eos_idx=ModelHyperParams.eos_idx,
                         output_bos=InferTaskConfig.output_bos,
                         output_eos=InferTaskConfig.output_eos):
        """
        Post-process the beam-search decoded sequence. Truncate from the first
        <eos>, then drop the <bos> and <eos> tokens unless ``output_bos`` and
        ``output_eos`` ask to keep them. Returns a list of indices.
        """
        eos_pos = len(seq) - 1
        for i, idx in enumerate(seq):
            if idx == eos_idx:
                eos_pos = i
                break
        seq = seq[:eos_pos + 1]
        return [
            idx for idx in seq
            if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
        ]

    for data in test_data.batch_generator():
        batch_seqs, _ = translate_batch(
            exe,
            [item[0] for item in data],
            encoder_program,
            encoder_data_input_fields + encoder_util_input_fields,
            [enc_output.name],
            decoder_program,
            # The last decoder data input field is moved after the util
            # input fields to match the order of the data prepared in
            # translate_batch.
            decoder_data_input_fields[:-1] + decoder_util_input_fields +
            (decoder_data_input_fields[-1], ),
            [predict.name],
            InferTaskConfig.beam_size,
            InferTaskConfig.max_length,
            InferTaskConfig.n_best,
            len(data),
            ModelHyperParams.n_head,
            ModelHyperParams.d_model,
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.bos_idx,
            ModelHyperParams.eos_idx,
            ModelHyperParams.unk_idx,
            output_unk=InferTaskConfig.output_unk)
        for inst_seqs in batch_seqs:
            # Post-process the beam-search decoded sequences.
            for seq in inst_seqs:
                print(" ".join(
                    [trg_idx2word[idx] for idx in post_process_seq(seq)]))

if __name__ == "__main__":
    # Script entry point: parse the command line (which also updates the
    # global configs), then run inference with the parsed arguments.
    infer(parse_args())