import argparse
import ast
import numpy as np

import paddle
import paddle.fluid as fluid

import model
from model import wrap_encoder as encoder
from model import wrap_decoder as decoder
from model import fast_decode as fast_decoder
from config import *
from train import pad_batch_data
import reader
import util


def parse_args():
    parser = argparse.ArgumentParser("Training for Transformer.")
    parser.add_argument(
        "--src_vocab_fpath",
        type=str,
        required=True,
        help="The path of vocabulary file of source language.")
    parser.add_argument(
        "--trg_vocab_fpath",
        type=str,
        required=True,
        help="The path of vocabulary file of target language.")
    parser.add_argument(
        "--test_file_pattern",
        type=str,
        required=True,
        help="The pattern to match test data files.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=50,
        help="The number of examples in one run for sequence generation.")
    parser.add_argument(
        "--pool_size",
        type=int,
        default=10000,
        help="The buffer size to pool data.")
    parser.add_argument(
        "--special_token",
        type=str,
        default=["<s>", "<e>", "<unk>"],
        nargs=3,
        help="The <bos>, <eos> and <unk> tokens in the dictionary.")
    parser.add_argument(
        "--use_wordpiece",
        type=ast.literal_eval,
        default=False,
        help="The flag indicating if the data is wordpiece data. The EN-FR data we "
        "provided is wordpiece data. For wordpiece data, converting ids to "
        "original words is a little different and some special codes are "
        "provided in util.py to do this.")
    parser.add_argument(
        'opts',
        help='See config.py for all options',
        default=None,
        nargs=argparse.REMAINDER)
    args = parser.parse_args()
    # Append args related to dict
    src_dict = reader.DataReader.load_dict(args.src_vocab_fpath)
    trg_dict = reader.DataReader.load_dict(args.trg_vocab_fpath)
    dict_args = [
        "src_vocab_size", str(len(src_dict)), "trg_vocab_size",
        str(len(trg_dict)), "bos_idx", str(src_dict[args.special_token[0]]),
        "eos_idx", str(src_dict[args.special_token[1]]), "unk_idx",
        str(src_dict[args.special_token[2]])
    ]
    merge_cfg_from_list(args.opts + dict_args,
                        [InferTaskConfig, ModelHyperParams])
    return args
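

# A hypothetical invocation (the paths are placeholders; the trailing `opts`
# arguments are assumed to be "name value" pairs for attributes defined in
# config.py):
#   python infer.py \
#       --src_vocab_fpath data/vocab.src \
#       --trg_vocab_fpath data/vocab.trg \
#       --test_file_pattern 'data/test.*' \
#       beam_size 5 max_out_len 255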


def translate_batch(exe,
                    src_words,
                    encoder,
                    enc_in_names,
                    enc_out_names,
                    decoder,
                    dec_in_names,
                    dec_out_names,
                    beam_size,
                    max_length,
                    n_best,
                    batch_size,
                    n_head,
                    d_model,
                    src_pad_idx,
                    trg_pad_idx,
                    bos_idx,
                    eos_idx,
                    unk_idx,
                    output_unk=True):
    """
    Run the encoder program once and run the decoder program multiple times to
    implement beam search externally. This is deprecated since a faster beam
    search decoder based solely on Fluid operators has been added.
    """
    # Prepare data for encoder and run the encoder.
    enc_in_data = pad_batch_data(
        src_words,
        src_pad_idx,
        n_head,
        is_target=False,
        is_label=False,
        return_attn_bias=True,
        return_max_len=False)
    # Append the data shape input to reshape the output of the embedding layer.
    enc_in_data = enc_in_data + [
        np.array(
            [-1, enc_in_data[2].shape[-1], d_model], dtype="int32")
    ]
    # Append the shape inputs to reshape before and after softmax in encoder
    # self attention.
    enc_in_data = enc_in_data + [
        np.array(
            [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array(
                enc_in_data[2].shape, dtype="int32")
    ]
    enc_output = exe.run(encoder,
                         feed=dict(zip(enc_in_names, enc_in_data)),
                         fetch_list=enc_out_names)[0]

    # Beam Search.
    # To store the beam info.
    scores = np.zeros((batch_size, beam_size), dtype="float32")
    prev_branchs = [[] for i in range(batch_size)]
    next_ids = [[] for i in range(batch_size)]
    # Use beam_inst_map to map beam idx to the instance idx in the batch,
    # since the size of the fed batch changes.
    beam_inst_map = {
        beam_idx: inst_idx
        for inst_idx, beam_idx in enumerate(range(batch_size))
    }
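    # For example (illustrative only), with batch_size == 3 the map starts
    # as {0: 0, 1: 1, 2: 2}; if instance 1 finishes decoding, active_beams
    # becomes [0, 2] and the map is rebuilt as {0: 0, 2: 1}.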
    # Use active_beams to record the alive (unfinished) instances.
    active_beams = list(range(batch_size))

    def beam_backtrace(prev_branchs, next_ids, n_best=beam_size):
        """
        Decode and select n_best sequences for one instance by backtrace.
        """
        seqs = []
        for i in range(n_best):
            k = i
            seq = []
            for j in range(len(prev_branchs) - 1, -1, -1):
                seq.append(next_ids[j][k])
                k = prev_branchs[j][k]
            seq = seq[::-1]
            # Add the <bos>, since next_ids don't include the <bos>.
            seq = [bos_idx] + seq
            seqs.append(seq)
        return seqs
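
    # A small worked example (toy values, not from a real run): with
    # prev_branchs == [[0, 0], [0, 1]] and next_ids == [[3, 4], [5, 6]],
    # backtracing the best beam starts from k = 0 at the last step: it
    # collects 5 then 3, reverses to [3, 5] and prepends <bos>, so
    # beam_backtrace(prev_branchs, next_ids, n_best=1) == [[bos_idx, 3, 5]].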

    def init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output):
        """
        Initialize the input data for decoder.
        """
        trg_words = np.array(
            [[bos_idx]] * batch_size * beam_size, dtype="int64")
        trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64")
        src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[
            -1], enc_in_data[2], 1
        # This is used to remove attention on subsequent words.
        trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len,
                                     trg_max_len))
        trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
            [-1, 1, trg_max_len, trg_max_len])
        trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
                             [-1e9]).astype("float32")
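        # For example (illustrative only), with trg_max_len == 3 each head
        # gets the bias matrix
        #   [[0., -1e9, -1e9],
        #    [0.,   0., -1e9],
        #    [0.,   0.,   0.]]
        # so that, after softmax, a position cannot attend to later positions.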
        # This is used to remove attention on the paddings of source sequences.
        trg_src_attn_bias = np.tile(
            src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis],
            [1, beam_size, 1, trg_max_len, 1]).reshape([
                -1, src_slf_attn_bias.shape[1], trg_max_len,
                src_slf_attn_bias.shape[-1]
            ])
        # Append the shape input to reshape the output of the embedding layer.
        trg_data_shape = np.array(
            [batch_size * beam_size, trg_max_len, d_model], dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # decoder self attention.
        trg_slf_attn_pre_softmax_shape = np.array(
            [-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
        trg_slf_attn_post_softmax_shape = np.array(
            trg_slf_attn_bias.shape, dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # encoder-decoder attention.
        trg_src_attn_pre_softmax_shape = np.array(
            [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
        trg_src_attn_post_softmax_shape = np.array(
            trg_src_attn_bias.shape, dtype="int32")
        enc_output = np.tile(
            enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape(
                [-1, enc_output.shape[-2], enc_output.shape[-1]])
        return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
            trg_src_attn_post_softmax_shape, enc_output

    def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map):
        """
        Update the input data of decoder mainly by slicing from the previous
        input data and dropping the finished instance beams.
        """
        trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
            trg_src_attn_post_softmax_shape, enc_output = dec_in_data
        trg_cur_len = trg_slf_attn_bias.shape[-1] + 1
        trg_words = np.array(
            [
                beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx])
                for beam_idx in active_beams
            ],
            dtype="int64")
        trg_words = trg_words.reshape([-1, 1])
        trg_pos = np.array(
            [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size,
            dtype="int64").reshape([-1, 1])
        active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams]
        active_beams_indice = (
            (np.array(active_beams) * beam_size)[:, np.newaxis] +
            np.array(range(beam_size))[np.newaxis, :]).flatten()
        # This is used to remove attention on subsequent words.
        trg_slf_attn_bias = np.ones((len(active_beams) * beam_size, trg_cur_len,
                                     trg_cur_len))
        trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
            [-1, 1, trg_cur_len, trg_cur_len])
        trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
                             [-1e9]).astype("float32")
        # This is used to remove attention on the paddings of source sequences.
        trg_src_attn_bias = np.tile(trg_src_attn_bias[
            active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
                                    [1, 1, trg_cur_len, 1])
        # Append the shape input to reshape the output of the embedding layer.
        trg_data_shape = np.array(
            [len(active_beams) * beam_size, trg_cur_len, d_model],
            dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # decoder self attention.
        trg_slf_attn_pre_softmax_shape = np.array(
            [-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
        trg_slf_attn_post_softmax_shape = np.array(
            trg_slf_attn_bias.shape, dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # encoder-decoder attention.
        trg_src_attn_pre_softmax_shape = np.array(
            [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
        trg_src_attn_post_softmax_shape = np.array(
            trg_src_attn_bias.shape, dtype="int32")
        enc_output = enc_output[active_beams_indice, :, :]
        return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
            trg_src_attn_post_softmax_shape, enc_output

    dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
                                   enc_output)
    for i in range(max_length):
        predict_all = exe.run(decoder,
                              feed=dict(zip(dec_in_names, dec_in_data)),
                              fetch_list=dec_out_names)[0]
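        # Keep only the probabilities of the newest position, move to log
        # space, and add the running beam scores, yielding scores of shape
        # [n_alive_instances, beam_size, vocab_size].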
        predict_all = np.log(
            predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
            [:, -1, :])
        predict_all = (predict_all + scores[active_beams].reshape(
            [len(beam_inst_map) * beam_size, -1])).reshape(
                [len(beam_inst_map), beam_size, -1])
        if not output_unk:  # To exclude the <unk> token.
            predict_all[:, :, unk_idx] = -1e9
        active_beams = []
        for beam_idx in range(batch_size):
            if beam_idx not in beam_inst_map:
                continue
            inst_idx = beam_inst_map[beam_idx]
            predict = (predict_all[inst_idx, :, :]
                       if i != 0 else predict_all[inst_idx, 0, :]).flatten()
            top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
            top_scores_ids = top_k_indice[np.argsort(
                predict[top_k_indice])[::-1]]
            top_scores = predict[top_scores_ids]
            scores[beam_idx] = top_scores
            # The flat top-k index encodes (parent beam, word id): integer
            # division recovers the parent beam, the modulo the word id.
            prev_branchs[beam_idx].append(top_scores_ids //
                                          predict_all.shape[-1])
            next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
            if next_ids[beam_idx][-1][0] != eos_idx:
                active_beams.append(beam_idx)
        if len(active_beams) == 0:
            break
        dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams,
                                         beam_inst_map)
        beam_inst_map = {
            beam_idx: inst_idx
            for inst_idx, beam_idx in enumerate(active_beams)
        }

    # Decode beams and select n_best sequences for each instance by backtrace.
    seqs = [
        beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)
        for beam_idx in range(batch_size)
    ]

    return seqs, scores[:, :n_best].tolist()


def post_process_seq(seq,
                     bos_idx=ModelHyperParams.bos_idx,
                     eos_idx=ModelHyperParams.eos_idx,
                     output_bos=InferTaskConfig.output_bos,
                     output_eos=InferTaskConfig.output_eos):
    """
    Post-process the beam-search decoded sequence. Truncate from the first
    <eos> and, by default, remove the <bos> and <eos> tokens.
    """
    eos_pos = len(seq) - 1
    for i, idx in enumerate(seq):
        if idx == eos_idx:
            eos_pos = i
            break
    seq = seq[:eos_pos + 1]
    return [
        idx for idx in seq
        if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
    ]
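

# A quick sanity check (hypothetical token ids):
#   post_process_seq([0, 5, 7, 1, 9], bos_idx=0, eos_idx=1,
#                    output_bos=False, output_eos=False) == [5, 7]
# i.e. the sequence is truncated at the first <eos> and the <bos>/<eos>
# tokens are dropped.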


def py_infer(test_data, trg_idx2word, use_wordpiece):
    """
    Inference by beam search implemented in Python, while the computation
    from symbols to probabilities is executed by Fluid operators.
    """
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    encoder_program = fluid.Program()
    with fluid.program_guard(main_program=encoder_program):
        enc_output = encoder(
            ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout, ModelHyperParams.weight_sharing)

    decoder_program = fluid.Program()
    with fluid.program_guard(main_program=decoder_program):
        predict = decoder(
            ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout, ModelHyperParams.weight_sharing)

    # Load model parameters of encoder and decoder separately from the saved
    # transformer model.
    encoder_var_names = []
    for op in encoder_program.block(0).ops:
        encoder_var_names += op.input_arg_names
    encoder_params = [
        encoder_program.block(0).var(var_name)
        for var_name in encoder_var_names
        if isinstance(encoder_program.block(0).var(var_name),
                      fluid.framework.Parameter)
    ]
    decoder_var_names = []
    for op in decoder_program.block(0).ops:
        decoder_var_names += op.input_arg_names
    decoder_params = [
        decoder_program.block(0).var(var_name)
        for var_name in decoder_var_names
        if isinstance(decoder_program.block(0).var(var_name),
                      fluid.framework.Parameter)
    ]
    fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params)
    fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)

    # This is used here to set dropout to the test mode.
    encoder_program = encoder_program.inference_optimize()
    decoder_program = decoder_program.inference_optimize()

    for batch_id, data in enumerate(test_data.batch_generator()):
        batch_seqs, batch_scores = translate_batch(
            exe,
            [item[0] for item in data],
            encoder_program,
            encoder_data_input_fields + encoder_util_input_fields,
            [enc_output.name],
            decoder_program,
            decoder_data_input_fields[:-1] + decoder_util_input_fields +
            (decoder_data_input_fields[-1], ),
            [predict.name],
            InferTaskConfig.beam_size,
            InferTaskConfig.max_out_len,
            InferTaskConfig.n_best,
            len(data),
            ModelHyperParams.n_head,
            ModelHyperParams.d_model,
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.bos_idx,
            ModelHyperParams.eos_idx,
            ModelHyperParams.unk_idx,
            output_unk=InferTaskConfig.output_unk)
        for i in range(len(batch_seqs)):
            # Post-process the beam-search decoded sequences.
            seqs = [post_process_seq(seq) for seq in batch_seqs[i]]
            scores = batch_scores[i]
            for seq in seqs:
                if use_wordpiece:
                    print(util.subword_ids_to_str(seq, trg_idx2word))
                else:
                    print(" ".join([trg_idx2word[idx] for idx in seq]))


def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx,
                        bos_idx, n_head, d_model, place):
    """
    Put all padded data needed by the beam search decoder into a dict.
    """
    src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
        [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
    # start tokens
    trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64")
    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                [1, 1, 1, 1]).astype("float32")

    # These shape tensors are used in reshape_op.
    src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32")
    trg_data_shape = np.array([-1, 1, d_model], dtype="int32")
    src_slf_attn_pre_softmax_shape = np.array(
        [-1, src_slf_attn_bias.shape[-1]], dtype="int32")
    src_slf_attn_post_softmax_shape = np.array(
        [-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32")
    trg_slf_attn_pre_softmax_shape = np.array(
        [-1, 1], dtype="int32")  # only the first time step
    trg_slf_attn_post_softmax_shape = np.array(
        [-1, n_head, 1, 1], dtype="int32")  # only the first time step
    trg_src_attn_pre_softmax_shape = np.array(
        [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
    trg_src_attn_post_softmax_shape = np.array(
        [-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32")
    # These inputs are used to change the shapes in the loop of while op.
    attn_pre_softmax_shape_delta = np.array([0, 1], dtype="int32")
    attn_post_softmax_shape_delta = np.array([0, 0, 0, 1], dtype="int32")

    def to_lodtensor(data, place, lod=None):
        data_tensor = fluid.LoDTensor()
        data_tensor.set(data, place)
        if lod is not None:
            data_tensor.set_lod(lod)
        return data_tensor
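
    # With N instances in the batch, [list(range(N + 1))] * 2 builds the
    # 2-level LoD [[0, 1, ..., N], [0, 1, ..., N]]: every instance is its own
    # sub-sequence holding a single step (the <bos>), which is the layout the
    # beam_search op expects for the initial ids and scores below.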

    # The beam_search op requires LoDTensors for its inputs.
    init_score = to_lodtensor(
        np.zeros_like(
            trg_word, dtype="float32"),
        place, [list(range(trg_word.shape[0] + 1))] * 2)
    trg_word = to_lodtensor(trg_word, place,
                            [list(range(trg_word.shape[0] + 1))] * 2)

    data_input_dict = dict(
        zip(data_input_names, [
            src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
            trg_src_attn_bias
        ]))
    util_input_dict = dict(
        zip(util_input_names, [
            src_data_shape, src_slf_attn_pre_softmax_shape,
            src_slf_attn_post_softmax_shape, trg_data_shape,
            trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
            trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape,
            attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta
        ]))

    input_dict = dict(data_input_dict)
    input_dict.update(util_input_dict)
    return input_dict


def fast_infer(test_data, trg_idx2word, use_wordpiece):
    """
    Inference by beam search decoder based solely on Fluid operators.
    """
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    out_ids, out_scores = fast_decoder(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, InferTaskConfig.beam_size,
        InferTaskConfig.max_out_len, ModelHyperParams.eos_idx)

    fluid.io.load_vars(
        exe,
        InferTaskConfig.model_path,
        vars=[
            var for var in fluid.default_main_program().list_vars()
            if isinstance(var, fluid.framework.Parameter)
        ])

    # This is used here to set dropout to the test mode.
    infer_program = fluid.default_main_program().inference_optimize()

    for batch_id, data in enumerate(test_data.batch_generator()):
        data_input = prepare_batch_input(
            data, encoder_data_input_fields + fast_decoder_data_input_fields,
            encoder_util_input_fields + fast_decoder_util_input_fields,
            ModelHyperParams.eos_idx, ModelHyperParams.bos_idx,
            ModelHyperParams.n_head, ModelHyperParams.d_model, place)
        seq_ids, seq_scores = exe.run(infer_program,
                                      feed=data_input,
                                      fetch_list=[out_ids, out_scores],
                                      return_numpy=False)
        # How to parse the results:
        #   Suppose the lod of seq_ids is:
        #     [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
        #   then from lod[0]:
        #     there are 2 source sentences, beam width is 3.
        #   from lod[1]:
        #     the first source sentence has 3 hyps; the lengths are 12, 12, 16
        #     the second source sentence has 3 hyps; the lengths are 14, 13, 15
        hyps = [[] for i in range(len(data))]
        scores = [[] for i in range(len(data))]
        for i in range(len(seq_ids.lod()[0]) - 1):  # for each source sentence
            start = seq_ids.lod()[0][i]
            end = seq_ids.lod()[0][i + 1]
            for j in range(end - start):  # for each candidate
                sub_start = seq_ids.lod()[1][start + j]
                sub_end = seq_ids.lod()[1][start + j + 1]
                hyps[i].append(" ".join([
                    trg_idx2word[idx]
                    for idx in post_process_seq(
                        np.array(seq_ids)[sub_start:sub_end])
                ]) if not use_wordpiece else util.subword_ids_to_str(
                    post_process_seq(np.array(seq_ids)[sub_start:sub_end]),
                    trg_idx2word))
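                # The accumulated log-prob of the whole candidate is stored
                # at its last step, hence the score at sub_end - 1.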
                scores[i].append(np.array(seq_scores)[sub_end - 1])
                print(hyps[i][-1])
                if len(hyps[i]) >= InferTaskConfig.n_best:
                    break


def infer(args, inferencer=fast_infer):
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    test_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.test_file_pattern,
        batch_size=args.batch_size,
        use_token_batch=False,
        pool_size=args.pool_size,
        sort_type=reader.SortType.NONE,
        shuffle=False,
        shuffle_batch=False,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    trg_idx2word = test_data.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)
    inferencer(test_data, trg_idx2word, args.use_wordpiece)


if __name__ == "__main__":
    args = parse_args()
    infer(args)