Commit 82ba5c03 authored by G guosheng

Align the step outputs of fast_infer with the original Python infer in Transformer

Parent 2b553441
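A minimal numpy sketch of the alignment idea (hypothetical sizes and names, not from this commit): each flat top-k index over the beam_size * vocab_size candidate scores encodes (previous branch, next token) as idx = pre_branch * vocab_size + next_id, so re-sorting the top-k ids by idx // vocab_size groups candidates by source branch, which is the order fast_infer emits them in.

import numpy as np

beam_size, vocab_size = 4, 10  # hypothetical sizes
probs = np.random.dirichlet(np.ones(beam_size * vocab_size))
log_probs = np.log(probs)  # both paths compare scores in log space
top_k = np.argpartition(log_probs, -beam_size)[-beam_size:]
top_k = top_k[np.argsort(log_probs[top_k])[::-1]]  # best score first
top_k = np.asarray(sorted(top_k, key=lambda i: i // vocab_size))  # group by pre_branch
pre_branch, next_ids = top_k // vocab_size, top_k % vocab_size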
@@ -255,6 +255,8 @@ def translate_batch(exe,
predict_all = exe.run(decoder,
feed=dict(zip(dec_in_names, dec_in_data)),
fetch_list=dec_out_names)[0]
print predict_all.reshape(
[len(beam_inst_map) * beam_size, i + 1, -1])[:, -1, :]
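# wrap_decoder now applies softmax unconditionally (see the model change
# below), so take the log here to turn the step's probabilities into
# log-probabilities comparable with the scores fast_infer accumulates.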
predict_all = np.log(
predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
[:, -1, :])
@@ -273,11 +275,19 @@ def translate_batch(exe,
top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice])[::-1]]
top_scores_ids = np.asarray(sorted(
top_scores_ids,
cmp=lambda x, y: x / predict_all.shape[-1] - y / predict_all.shape[-1]
))  # sort by pre_branch and score to compare with fast_infer
top_scores = predict[top_scores_ids]
scores[beam_idx] = top_scores
prev_branchs[beam_idx].append(top_scores_ids / predict_all.shape[-1])
next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
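# Debug prints: per-step source branches, token ids and scores, to diff
# directly against the layers.Print output on the fast_infer side.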
print prev_branchs[beam_idx][-1]
print next_ids[beam_idx][-1]
print top_scores
if next_ids[beam_idx][-1][0] != eos_idx:
active_beams.append(beam_idx)
if len(active_beams) == 0:
@@ -342,10 +352,8 @@ def infer(args):
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
# This is used here to set dropout to the test mode.
encoder_program = fluid.io.get_inference_program(
target_vars=[enc_output], main_program=encoder_program)
decoder_program = fluid.io.get_inference_program(
target_vars=[predict], main_program=decoder_program)
encoder_program = encoder_program.inference_optimize()
decoder_program = decoder_program.inference_optimize()
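# Assumption: inference_optimize() is used as a drop-in for
# get_inference_program here, i.e. it switches ops such as dropout to
# test mode without pruning to explicit target_vars.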
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
@@ -543,8 +551,10 @@ def fast_infer(args):
for idx in np.array(seq_ids)[sub_start:sub_end]
]))
print hyps[i]
print len(hyps[i]), [len(hyp.split()) for hyp in hyps[i]]
if __name__ == "__main__":
args = parse_args()
fast_infer(args)
# infer(args)
@@ -6,6 +6,8 @@ import paddle.fluid.layers as layers
from config import *
FLAG = False
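# One-shot debug flag: decoder() below turns it on for a single decoder
# layer so that multi_head_attention prints its attention intermediates
# exactly once and then resets it.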
def position_encoding_init(n_position, d_pos_vec):
"""
@@ -121,6 +123,15 @@ def multi_head_attention(queries,
act="softmax")
weights = layers.reshape(
x=weights, shape=product.shape, actual_shape=post_softmax_shape)
global FLAG
if FLAG:
print "hehehehehe"
layers.Print(scaled_q)
layers.Print(k)
layers.Print(v)
layers.Print(product)
layers.Print(weights)
FLAG = False
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False)
@@ -133,6 +144,13 @@ def multi_head_attention(queries,
if cache is not None: # use cache and concat time steps
k = cache["k"] = layers.concat([cache["k"], k], axis=1)
v = cache["v"] = layers.concat([cache["v"], v], axis=1)
# global FLAG
# if FLAG:
# print "hehehehehe"
# layers.Print(q)
# layers.Print(k)
# layers.Print(v)
# FLAG = False
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
@@ -147,6 +165,11 @@ def multi_head_attention(queries,
param_attr=fluid.initializer.Xavier(uniform=False),
bias_attr=False,
num_flatten_dims=2)
# global FLAG
# if FLAG:
# print "hehehehehe"
# layers.Print(proj_out)
# FLAG = False
return proj_out
@@ -377,6 +400,9 @@ def decoder(dec_input,
The decoder is composed of a stack of identical decoder_layer layers.
"""
for i in range(n_layer):
if i == 0:  # alternatively n_layer - 1, to trace the last layer instead
global FLAG
FLAG = True
dec_output = decoder_layer(
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias, n_head,
d_key, d_value, d_model, d_inner_hid, dropout_rate,
@@ -572,7 +598,7 @@ def wrap_decoder(trg_vocab_size,
bias_attr=False,
num_flatten_dims=2),
shape=[-1, trg_vocab_size],
act="softmax" if dec_inputs is None else None)
act="softmax") # if dec_inputs is None else None)
return predict
@@ -645,7 +671,8 @@ def fast_decode(
"v": layers.sequence_expand(
x=cache["v"], y=pre_scores),
} for cache in caches]
layers.Print(pre_ids)
# layers.Print(pre_ids)
# layers.Print(pre_pos)
# layers.Print(pre_enc_output)
# layers.Print(pre_src_attn_bias)
# layers.Print(pre_caches[0]["k"])
@@ -667,9 +694,13 @@ def fast_decode(
src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
enc_output=pre_enc_output,
caches=pre_caches)
layers.Print(logits)
topk_scores, topk_indices = layers.topk(logits, k=beam_size)
# layers.Print(topk_scores)
# layers.Print(topk_indices)
accu_scores = layers.elementwise_add(
x=layers.log(x=layers.softmax(topk_scores)),
# x=layers.log(x=layers.softmax(topk_scores)),
x=layers.log(topk_scores),
y=layers.reshape(
pre_scores, shape=[-1]),
axis=0)
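With the decoder now emitting probabilities (see the wrap_decoder change above), the extra layers.softmax before the log is dropped. A numpy sketch of the accumulation, with hypothetical names (topk_probs of shape [num_beams, k], pre_scores of shape [num_beams]):

accu = np.log(topk_probs) + pre_scores[:, None]  # elementwise_add with axis=0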
@@ -690,8 +721,13 @@ def fast_decode(
for i in range(n_layer):
layers.assign(pre_caches[i]["k"], caches[i]["k"])
layers.assign(pre_caches[i]["v"], caches[i]["v"])
layers.Print(selected_ids)
layers.Print(selected_scores)
# layers.Print(caches[-1]["k"])
layers.assign(
slf_attn_pre_softmax_shape + attn_pre_softmax_shape_delta,
layers.elementwise_add(
x=slf_attn_pre_softmax_shape,
y=attn_pre_softmax_shape_delta),
slf_attn_pre_softmax_shape)
layers.assign(
layers.elementwise_add(
@@ -703,7 +739,8 @@ def fast_decode(
all_finish_cond = layers.less_than(x=step_idx, y=max_len)
layers.logical_or(x=max_len_cond, y=all_finish_cond, out=cond)
finished_ids, finished_scores = layers.beam_search_decode(ids, scores)
finished_ids, finished_scores = layers.beam_search_decode(ids, scores, eos_idx)
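# eos_idx is now passed through to beam_search_decode, presumably so the
# trace-back treats EOS the same way as the python infer's stopping check
# (next_ids[beam_idx][-1][0] != eos_idx).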
return finished_ids, finished_scores
finished_ids, finished_scores = beam_search()