Commit 82ba5c03 authored by G guosheng

Align the step outputs of fast_infer with the original python infer in Transformer

Parent 2b553441
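
The prints and graph-side Print ops added in this commit dump per-step beam state from both decoders so the two can be compared step by step. As a rough sketch of that comparison loop (the names py_steps and fast_steps are hypothetical lists of per-step [beam_size] score arrays collected from the two runs):

import numpy as np

def check_step_alignment(py_steps, fast_steps, atol=1e-5):
    # Hypothetical driver: walk the per-step beam scores from the python
    # decoder and fast_infer in lockstep and report the first divergence.
    for step, (ref, fast) in enumerate(zip(py_steps, fast_steps)):
        if not np.allclose(ref, fast, atol=atol):
            print "step %d diverges:" % step, ref, fast
            return False
    return True
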
@@ -255,6 +255,8 @@ def translate_batch(exe,
         predict_all = exe.run(decoder,
                               feed=dict(zip(dec_in_names, dec_in_data)),
                               fetch_list=dec_out_names)[0]
+        print predict_all.reshape(
+            [len(beam_inst_map) * beam_size, i + 1, -1])[:, -1, :]
         predict_all = np.log(
             predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
             [:, -1, :])
@@ -273,11 +275,19 @@ def translate_batch(exe,
             top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
             top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice])[::
                                                                             -1]]
+            top_scores_ids = np.asarray(
+                sorted(
+                    top_scores_ids,
+                    lambda x, y: x / predict_all.shape[-1] - y / predict_all.shape[-1]
+                ))  # sort by pre_branch and score to compare with fast_infer
             top_scores = predict[top_scores_ids]
             scores[beam_idx] = top_scores
             prev_branchs[beam_idx].append(top_scores_ids /
                                           predict_all.shape[-1])
             next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
+            print prev_branchs[beam_idx][-1]
+            print next_ids[beam_idx][-1]
+            print top_scores
             if next_ids[beam_idx][-1][0] != eos_idx:
                 active_beams.append(beam_idx)
         if len(active_beams) == 0:
@@ -342,10 +352,8 @@ def infer(args):
     fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
 
     # This is used here to set dropout to the test mode.
-    encoder_program = fluid.io.get_inference_program(
-        target_vars=[enc_output], main_program=encoder_program)
-    decoder_program = fluid.io.get_inference_program(
-        target_vars=[predict], main_program=decoder_program)
+    encoder_program = encoder_program.inference_optimize()
+    decoder_program = decoder_program.inference_optimize()
 
     test_data = reader.DataReader(
         src_vocab_fpath=args.src_vocab_fpath,
@@ -543,8 +551,10 @@ def fast_infer(args):
                 for idx in np.array(seq_ids)[sub_start:sub_end]
             ]))
         print hyps[i]
+        print len(hyps[i]), [len(hyp.split()) for hyp in hyps[i]]
 
 
 if __name__ == "__main__":
     args = parse_args()
     fast_infer(args)
+    # infer(args)
@@ -6,6 +6,8 @@ import paddle.fluid.layers as layers
 
 from config import *
 
+FLAG = False
+
 
 def position_encoding_init(n_position, d_pos_vec):
     """
@@ -121,6 +123,15 @@ def multi_head_attention(queries,
             act="softmax")
         weights = layers.reshape(
             x=weights, shape=product.shape, actual_shape=post_softmax_shape)
+        global FLAG
+        if FLAG:
+            print "hehehehehe"
+            layers.Print(scaled_q)
+            layers.Print(k)
+            layers.Print(v)
+            layers.Print(product)
+            layers.Print(weights)
+            FLAG = False
         if dropout_rate:
             weights = layers.dropout(
                 weights, dropout_prob=dropout_rate, is_test=False)
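
The layers.Print calls above differ from the plain print: they insert a print operator into the program, so tensor values are dumped every time the executor runs a step, whereas a python print fires only once while the graph is being built (hence the FLAG guard, which adds the ops for a single layer only). A minimal standalone example of the pattern, assuming the fluid API of this era:

import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name="x", shape=[4], dtype="float32")
y = fluid.layers.fc(input=x, size=4)
fluid.layers.Print(y, message="fc output")  # printed on every run

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(feed={"x": np.ones((1, 4), dtype="float32")})
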
@@ -133,6 +144,13 @@ def multi_head_attention(queries,
     if cache is not None:  # use cache and concat time steps
         k = cache["k"] = layers.concat([cache["k"], k], axis=1)
         v = cache["v"] = layers.concat([cache["v"], v], axis=1)
+    # global FLAG
+    # if FLAG:
+    #     print "hehehehehe"
+    #     layers.Print(q)
+    #     layers.Print(k)
+    #     layers.Print(v)
+    #     FLAG = False
     q = __split_heads(q, n_head)
     k = __split_heads(k, n_head)
     v = __split_heads(v, n_head)
@@ -147,6 +165,11 @@ def multi_head_attention(queries,
                          param_attr=fluid.initializer.Xavier(uniform=False),
                          bias_attr=False,
                          num_flatten_dims=2)
+    # global FLAG
+    # if FLAG:
+    #     print "hehehehehe"
+    #     layers.Print(proj_out)
+    #     FLAG = False
     return proj_out
@@ -377,6 +400,9 @@ def decoder(dec_input,
     The decoder is composed of a stack of identical decoder_layer layers.
     """
     for i in range(n_layer):
+        if i == 0:  #n_layer-1:
+            global FLAG
+            FLAG = True
         dec_output = decoder_layer(
             dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias, n_head,
             d_key, d_value, d_model, d_inner_hid, dropout_rate,
@@ -572,7 +598,7 @@ def wrap_decoder(trg_vocab_size,
             bias_attr=False,
             num_flatten_dims=2),
         shape=[-1, trg_vocab_size],
-        act="softmax" if dec_inputs is None else None)
+        act="softmax")  # if dec_inputs is None else None)
     return predict
@@ -645,7 +671,8 @@ def fast_decode(
                 "v": layers.sequence_expand(
                     x=cache["v"], y=pre_scores),
             } for cache in caches]
-            layers.Print(pre_ids)
+            # layers.Print(pre_ids)
+            # layers.Print(pre_pos)
            # layers.Print(pre_enc_output)
            # layers.Print(pre_src_attn_bias)
            # layers.Print(pre_caches[0]["k"])
@@ -667,9 +694,13 @@ def fast_decode(
                     src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
                 enc_output=pre_enc_output,
                 caches=pre_caches)
+            layers.Print(logits)
             topk_scores, topk_indices = layers.topk(logits, k=beam_size)
+            # layers.Print(topk_scores)
+            # layers.Print(topk_indices)
             accu_scores = layers.elementwise_add(
-                x=layers.log(x=layers.softmax(topk_scores)),
+                # x=layers.log(x=layers.softmax(topk_scores)),
+                x=layers.log(topk_scores),
                 y=layers.reshape(
                     pre_scores, shape=[-1]),
                 axis=0)
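
This change pairs with the wrap_decoder edit above: since the decoder now always applies softmax over the full vocabulary, topk_scores are already probabilities and only need a log. The old expression re-normalized with a softmax over just the k selected values, which cannot match the python infer's full-vocabulary log-probs. A small numpy illustration of the mismatch:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

logits = np.array([3.0, 2.0, 1.0, 0.5])  # sorted, so the top-2 are first
k = 2

aligned = np.log(softmax(logits)[:k])   # softmax over full vocab, then log
renormed = np.log(softmax(logits[:k]))  # softmax over only the top-k

print aligned   # ~[-0.46, -1.46]
print renormed  # ~[-0.31, -1.31]: different normalizer, no alignment
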
@@ -690,8 +721,13 @@ def fast_decode(
             for i in range(n_layer):
                 layers.assign(pre_caches[i]["k"], caches[i]["k"])
                 layers.assign(pre_caches[i]["v"], caches[i]["v"])
+            layers.Print(selected_ids)
+            layers.Print(selected_scores)
+            # layers.Print(caches[-1]["k"])
             layers.assign(
-                slf_attn_pre_softmax_shape + attn_pre_softmax_shape_delta,
+                layers.elementwise_add(
+                    x=slf_attn_pre_softmax_shape,
+                    y=attn_pre_softmax_shape_delta),
                 slf_attn_pre_softmax_shape)
             layers.assign(
                 layers.elementwise_add(
@@ -703,7 +739,8 @@ def fast_decode(
             all_finish_cond = layers.less_than(x=step_idx, y=max_len)
             layers.logical_or(x=max_len_cond, y=all_finish_cond, out=cond)
 
-        finished_ids, finished_scores = layers.beam_search_decode(ids, scores)
+        finished_ids, finished_scores = layers.beam_search_decode(ids, scores,
+                                                                  eos_idx)
         return finished_ids, finished_scores
 
     finished_ids, finished_scores = beam_search()