Commit 86b76ab6 authored by Yu Yang

Remove deprecated method

Parent eb25dcec
@@ -85,239 +85,6 @@ def parse_args():
return args
def translate_batch(exe,
src_words,
encoder,
enc_in_names,
enc_out_names,
decoder,
dec_in_names,
dec_out_names,
beam_size,
max_length,
n_best,
batch_size,
n_head,
d_model,
src_pad_idx,
trg_pad_idx,
bos_idx,
eos_idx,
unk_idx,
output_unk=True):
"""
Run the encoder program once and run the decoder program multiple times to
implement beam search externally. This is deprecated since a faster beam
search decoder based solely on Fluid operators has been added.
"""
# Prepare data for encoder and run the encoder.
enc_in_data = pad_batch_data(
src_words,
src_pad_idx,
n_head,
is_target=False,
is_label=False,
return_attn_bias=True,
return_max_len=False)
# Append the data shape input to reshape the output of embedding layer.
enc_in_data = enc_in_data + [
np.array(
[-1, enc_in_data[2].shape[-1], d_model], dtype="int32")
]
# Append the shape inputs to reshape before and after softmax in encoder
# self attention.
enc_in_data = enc_in_data + [
np.array(
[-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array(
enc_in_data[2].shape, dtype="int32")
]
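As an aside, the pre/post softmax shape inputs exist because the 4-D attention logits are flattened to 2-D before softmax and restored afterwards (presumably because the softmax there operated on 2-D input). A NumPy-only sketch of the shape round trip, with toy shapes rather than the model's:

```python
import numpy as np

# Shape round trip implied by the pre/post softmax shape inputs above:
# 4-D attention logits [batch, head, q_len, k_len] are flattened to
# [-1, k_len] for softmax and reshaped back afterwards.
attn = np.zeros((2, 8, 5, 5), dtype="float32")
pre_softmax_shape = [-1, attn.shape[-1]]   # what the "pre" input encodes
post_softmax_shape = attn.shape            # what the "post" input encodes
flat = attn.reshape(pre_softmax_shape)
restored = flat.reshape(post_softmax_shape)
assert restored.shape == attn.shape
```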
enc_output = exe.run(encoder,
feed=dict(zip(enc_in_names, enc_in_data)),
fetch_list=enc_out_names)[0]
# Beam Search.
# To store the beam info.
scores = np.zeros((batch_size, beam_size), dtype="float32")
prev_branchs = [[] for i in range(batch_size)]
next_ids = [[] for i in range(batch_size)]
# Use beam_inst_map to map beam idx to the instance idx in batch, since
# the size of the fed batch changes.
beam_inst_map = {
beam_idx: inst_idx
for inst_idx, beam_idx in enumerate(range(batch_size))
}
# Use active_beams to record the indices of alive instances.
active_beams = list(range(batch_size))
def beam_backtrace(prev_branchs, next_ids, n_best=beam_size):
"""
Decode and select n_best sequences for one instance by backtrace.
"""
seqs = []
for i in range(n_best):
k = i
seq = []
for j in range(len(prev_branchs) - 1, -1, -1):
seq.append(next_ids[j][k])
k = prev_branchs[j][k]
seq = seq[::-1]
# Add the <bos>, since next_ids don't include the <bos>.
seq = [bos_idx] + seq
seqs.append(seq)
return seqs
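To make the backtrace concrete, here is a self-contained toy run of the same logic (values are made up; `prev_branchs[t][k]` is the step-`t-1` branch that candidate `k` extends, `next_ids[t][k]` its chosen token):

```python
def beam_backtrace(prev_branchs, next_ids, n_best=2, bos_idx=0):
    # Same walk as above: start from the i-th best final candidate and
    # follow prev_branchs backwards, collecting next_ids along the way.
    seqs = []
    for i in range(n_best):
        k = i
        seq = []
        for j in range(len(prev_branchs) - 1, -1, -1):
            seq.append(next_ids[j][k])
            k = prev_branchs[j][k]
        seqs.append([bos_idx] + seq[::-1])
    return seqs

# Two decoding steps, beam_size = 2 (toy values).
prev_branchs = [[0, 0], [1, 0]]
next_ids = [[7, 3], [9, 5]]
print(beam_backtrace(prev_branchs, next_ids))  # [[0, 3, 9], [0, 7, 5]]
```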
def init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output):
"""
Initialize the input data for decoder.
"""
trg_words = np.array(
[[bos_idx]] * batch_size * beam_size, dtype="int64")
trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64")
src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[
-1], enc_in_data[2], 1
# This is used to remove attention on subsequent words.
trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len,
trg_max_len))
trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
[-1, 1, trg_max_len, trg_max_len])
trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
[-1e9]).astype("float32")
# This is used to remove attention on the paddings of source sequences.
trg_src_attn_bias = np.tile(
src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis],
[1, beam_size, 1, trg_max_len, 1]).reshape([
-1, src_slf_attn_bias.shape[1], trg_max_len,
src_slf_attn_bias.shape[-1]
])
# Append the shape input to reshape the output of embedding layer.
trg_data_shape = np.array(
[batch_size * beam_size, trg_max_len, d_model], dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# decoder self attention.
trg_slf_attn_pre_softmax_shape = np.array(
[-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
trg_slf_attn_post_softmax_shape = np.array(
trg_slf_attn_bias.shape, dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# encoder-decoder attention.
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32")
enc_output = np.tile(
enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape(
[-1, enc_output.shape[-2], enc_output.shape[-1]])
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
trg_src_attn_post_softmax_shape, enc_output
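The np.triu trick above is easiest to see in isolation; a minimal sketch of the causal bias, single head and no batch/beam tiling:

```python
import numpy as np

# Strictly upper-triangular ones (k=1) mark attention to *later*
# positions; scaled by -1e9 they vanish after softmax, so position i
# can only attend to positions 0..i.
trg_max_len = 4
bias = (np.triu(np.ones((trg_max_len, trg_max_len)), 1) * -1e9).astype("float32")
print(bias)
# Row 0: [0, -1e9, -1e9, -1e9]; row 3: all zeros (may attend everywhere).
```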
def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map):
"""
Update the input data of decoder mainly by slicing from the previous
input data and dropping the finished instance beams.
"""
trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
trg_src_attn_post_softmax_shape, enc_output = dec_in_data
trg_cur_len = trg_slf_attn_bias.shape[-1] + 1
trg_words = np.array(
[
beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx])
for beam_idx in active_beams
],
dtype="int64")
trg_words = trg_words.reshape([-1, 1])
trg_pos = np.array(
[range(1, trg_cur_len + 1)] * len(active_beams) * beam_size,
dtype="int64").reshape([-1, 1])
active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams]
active_beams_indice = (
(np.array(active_beams) * beam_size)[:, np.newaxis] +
np.array(range(beam_size))[np.newaxis, :]).flatten()
# This is used to remove attention on subsequent words.
trg_slf_attn_bias = np.ones((len(active_beams) * beam_size, trg_cur_len,
trg_cur_len))
trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
[-1, 1, trg_cur_len, trg_cur_len])
trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
[-1e9]).astype("float32")
# This is used to remove attention on the paddings of source sequences.
trg_src_attn_bias = np.tile(trg_src_attn_bias[
active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
[1, 1, trg_cur_len, 1])
# Append the shape input to reshape the output of embedding layer.
trg_data_shape = np.array(
[len(active_beams) * beam_size, trg_cur_len, d_model],
dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# decoder self attention.
trg_slf_attn_pre_softmax_shape = np.array(
[-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
trg_slf_attn_post_softmax_shape = np.array(
trg_slf_attn_bias.shape, dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# encoder-decoder attention.
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32")
enc_output = enc_output[active_beams_indice, :, :]
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
trg_src_attn_post_softmax_shape, enc_output
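The active_beams_indice arithmetic above maps the still-active instance indices to rows of the flattened (batch × beam) arrays; a toy check:

```python
import numpy as np

# Each active instance owns a contiguous block of beam_size rows in
# the flattened layout; broadcasting builds all row indices at once.
beam_size = 3
active_beams = [0, 2]  # instance indices still decoding
active_beams_indice = (
    (np.array(active_beams) * beam_size)[:, np.newaxis] +
    np.arange(beam_size)[np.newaxis, :]).flatten()
print(active_beams_indice)  # [0 1 2 6 7 8]
```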
dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
enc_output)
for i in range(max_length):
predict_all = exe.run(decoder,
feed=dict(zip(dec_in_names, dec_in_data)),
fetch_list=dec_out_names)[0]
predict_all = np.log(
predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
[:, -1, :])
predict_all = (predict_all + scores[active_beams].reshape(
[len(beam_inst_map) * beam_size, -1])).reshape(
[len(beam_inst_map), beam_size, -1])
if not output_unk: # To exclude the <unk> token.
predict_all[:, :, unk_idx] = -1e9
active_beams = []
for beam_idx in range(batch_size):
if beam_idx not in beam_inst_map:
continue
inst_idx = beam_inst_map[beam_idx]
predict = (predict_all[inst_idx, :, :]
if i != 0 else predict_all[inst_idx, 0, :]).flatten()
top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice])[::-1]]
top_scores = predict[top_scores_ids]
scores[beam_idx] = top_scores
# Floor division splits each flat index into its source branch.
prev_branchs[beam_idx].append(top_scores_ids //
predict_all.shape[-1])
next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
if next_ids[beam_idx][-1][0] != eos_idx:
active_beams.append(beam_idx)
if len(active_beams) == 0:
break
dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams,
beam_inst_map)
beam_inst_map = {
beam_idx: inst_idx
for inst_idx, beam_idx in enumerate(active_beams)
}
# Decode beams and select n_best sequences for each instance by backtrace.
seqs = [
beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)
for beam_idx in range(batch_size)
]
return seqs, scores[:, :n_best].tolist()
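A standalone sketch of the per-step selection used in the loop above: flatten the scores over all beam_size × vocab_size continuations, take the top beam_size with argpartition, then split each flat index into (source branch, token id):

```python
import numpy as np

# Toy scores: 2 branches over a 5-token vocabulary.
beam_size, vocab_size = 2, 5
predict = np.log(np.array([[.1, .2, .3, .25, .15],
                           [.05, .4, .2, .2, .15]])).flatten()
# Indices of the beam_size largest entries, then sorted best-first.
top_k = np.argpartition(predict, -beam_size)[-beam_size:]
top_ids = top_k[np.argsort(predict[top_k])[::-1]]
print(top_ids // vocab_size)  # source branch of each survivor -> [1 0]
print(top_ids % vocab_size)   # token chosen by each survivor  -> [1 2]
```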
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
@@ -339,91 +106,6 @@ def post_process_seq(
seq)
def py_infer(test_data, trg_idx2word, use_wordpiece):
"""
Inference by beam search implemented in Python, while the calculations
from symbols to probabilities are executed by Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
encoder_program = fluid.Program()
with fluid.program_guard(main_program=encoder_program):
enc_output = encoder(
ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.dropout, ModelHyperParams.weight_sharing)
decoder_program = fluid.Program()
with fluid.program_guard(main_program=decoder_program):
predict = decoder(
ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.dropout, ModelHyperParams.weight_sharing)
# Load model parameters of encoder and decoder separately from the saved
# transformer model.
encoder_var_names = []
for op in encoder_program.block(0).ops:
encoder_var_names += op.input_arg_names
encoder_param_names = [
var_name for var_name in encoder_var_names
if isinstance(encoder_program.block(0).var(var_name),
fluid.framework.Parameter)
]
encoder_params = [
encoder_program.block(0).var(var_name) for var_name in encoder_param_names
]
decoder_var_names = []
for op in decoder_program.block(0).ops:
decoder_var_names += op.input_arg_names
decoder_param_names = [
var_name for var_name in decoder_var_names
if isinstance(decoder_program.block(0).var(var_name),
fluid.framework.Parameter)
]
decoder_params = [
decoder_program.block(0).var(var_name) for var_name in decoder_param_names
]
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params)
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
# This is used here to switch dropout to test mode.
encoder_program = encoder_program.inference_optimize()
decoder_program = decoder_program.inference_optimize()
for batch_id, data in enumerate(test_data.batch_generator()):
batch_seqs, batch_scores = translate_batch(
exe,
[item[0] for item in data],
encoder_program,
encoder_data_input_fields + encoder_util_input_fields,
[enc_output.name],
decoder_program,
decoder_data_input_fields[:-1] + decoder_util_input_fields +
(decoder_data_input_fields[-1], ),
[predict.name],
InferTaskConfig.beam_size,
InferTaskConfig.max_out_len,
InferTaskConfig.n_best,
len(data),
ModelHyperParams.n_head,
ModelHyperParams.d_model,
ModelHyperParams.eos_idx, # Use eos_idx to pad.
ModelHyperParams.eos_idx, # Use eos_idx to pad.
ModelHyperParams.bos_idx,
ModelHyperParams.eos_idx,
ModelHyperParams.unk_idx,
output_unk=InferTaskConfig.output_unk)
for i in range(len(batch_seqs)):
# Post-process the beam-search decoded sequences.
seqs = [post_process_seq(seq) for seq in batch_seqs[i]]
scores = batch_scores[i]
for seq in seqs:
if use_wordpiece:
print(util.subword_ids_to_str(seq, trg_idx2word))
else:
print(" ".join([trg_idx2word[idx] for idx in seq]))
def prepare_batch_input(insts, data_input_names, src_pad_idx, bos_idx, n_head,
d_model, place):
"""
......
@@ -335,6 +335,10 @@ def decoder(dec_input,
The decoder is composed of a stack of identical decoder_layer layers.
"""
for i in range(n_layer):
+ cache = None
+ if caches is not None:
+ cache = caches[i]
dec_output = decoder_layer(
dec_input,
enc_output,
@@ -345,7 +349,8 @@
d_value,
d_model,
d_inner_hid,
- dropout_rate, )
+ dropout_rate,
+ cache=cache)
dec_input = dec_output
return dec_output
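The hunk above threads an optional per-layer cache through the stack: caches is either None or a list with one entry per layer, and each layer only ever sees its own slot. A framework-free sketch of the pattern (the stand-in layer below is hypothetical, not the real decoder_layer):

```python
# caches is None (training) or one dict per layer (incremental decode);
# each layer receives only its own slot.
def decoder_layer(x, cache=None):
    if cache is not None:
        cache["calls"] = cache.get("calls", 0) + 1  # e.g. append k/v here
    return x + 1

def decoder(x, n_layer, caches=None):
    for i in range(n_layer):
        cache = None if caches is None else caches[i]
        x = decoder_layer(x, cache=cache)
    return x

caches = [{} for _ in range(3)]
print(decoder(0, n_layer=3, caches=caches), caches)
# 3 [{'calls': 1}, {'calls': 1}, {'calls': 1}]
```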
@@ -515,7 +520,8 @@ def wrap_decoder(trg_vocab_size,
d_value,
d_model,
d_inner_hid,
- dropout_rate, )
+ dropout_rate,
+ caches=caches)
# Return logits for training and probs for inference.
if weight_sharing:
predict = layers.matmul(
@@ -565,8 +571,7 @@ def fast_decode(
cond = layers.less_than(x=step_idx, y=max_len)
while_op = layers.While(cond)
# array states will be stored for each step.
- ids = layers.array_write(start_tokens, step_idx)
- ids_flatten = layers.array_write(
+ ids = layers.array_write(
+ layers.reshape(start_tokens, (-1, 1)), step_idx)
scores = layers.array_write(init_scores, step_idx)
# cell states will be overwritten at each step.
@@ -586,6 +591,7 @@ def fast_decode(
} for i in range(n_layer)]
with while_op.block():
pre_ids = layers.array_read(array=ids, i=step_idx)
+ pre_ids = layers.reshape(pre_ids, (-1, 1, 1))
pre_scores = layers.array_read(array=scores, i=step_idx)
# sequence_expand can gather sequences according to lod, and thus can be
# used in beam search to sift states corresponding to the selected ids.
@@ -642,8 +648,6 @@ def fast_decode(
layers.increment(x=step_idx, value=1.0, in_place=True)
# update states
- layers.array_write(selected_ids, i=step_idx, array=ids_flatten)
- selected_ids = layers.reshape(selected_ids, shape=(-1, 1, 1))
layers.array_write(selected_ids, i=step_idx, array=ids)
layers.array_write(selected_scores, i=step_idx, array=scores)
layers.assign(pre_src_attn_bias, trg_src_attn_bias)
@@ -656,7 +660,7 @@ def fast_decode(
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
finished_ids, finished_scores = layers.beam_search_decode(
- ids_flatten, scores, beam_size=beam_size, end_id=eos_idx)
+ ids, scores, beam_size=beam_size, end_id=eos_idx)
return finished_ids, finished_scores
finished_ids, finished_scores = beam_search()
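Net effect of the fast_decode changes: the separate ids/ids_flatten arrays are merged, ids now stores the flat (-1, 1) layout that beam_search_decode expects, and the (-1, 1, 1) view is produced on read rather than on write. A NumPy sketch of the two layouts:

```python
import numpy as np

# Stored layout (what array_write now keeps) vs. the per-step view
# (what the reshape after array_read produces).
selected_ids = np.array([3, 7, 1], dtype="int64").reshape(-1, 1)
pre_ids = selected_ids.reshape(-1, 1, 1)  # view consumed by the next step
print(selected_ids.shape, pre_ids.shape)  # (3, 1) (3, 1, 1)
```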
......