提交 803ee976 编写于 作者: K kuke 提交者: Yibing

Add example for NMT without attention

上级 10ab8b5e
#!/usr/bin/env python
import sys
import gzip
import sqlite3
import paddle.v2 as paddle
### Parameters
# Size of the word embeddings (used for both the source and the target
# embedding layers in seq2seq_net).
word_vector_dim = 620
# Hidden size shared by the GRU encoder and the GRU decoder.
latent_chain_dim = 1000
# beam_size and max_length are handed to paddle.layer.beam_search when the
# network is built in generation mode.
beam_size = 3
max_length = 50
def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
    """Build the encoder-decoder network for NMT without attention.

    The encoder runs a forward and a reversed simple_gru over the source
    embedding and concatenates their outputs. The decoder is a GRU step
    function whose memory is booted from a Tanh projection of last_seq over
    the encoded sequence.

    Args:
        source_dict_dim: dictionary size of the source word-id sequence.
        target_dict_dim: dictionary size of the target word-id sequences;
            also the size of the decoder's softmax output.
        generating: when False, build the training network; when True, build
            the beam-search generation network.

    Returns:
        The classification_cost layer when ``generating`` is False, otherwise
        the beam_search layer.
    """
    hidden_dim = latent_chain_dim
    encoder_size = hidden_dim
    decoder_size = hidden_dim

    ### Encoder
    source_words = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    source_embedding = paddle.layer.embedding(
        input=source_words,
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    # Layers are created in the same order as before: the forward GRU first,
    # then the reversed one.
    directional_states = [
        paddle.networks.simple_gru(
            input=source_embedding,
            act=paddle.activation.Tanh(),
            gate_act=paddle.activation.Sigmoid(),
            size=encoder_size,
            reverse=run_reversed) for run_reversed in (False, True)
    ]
    encoded_vector = paddle.layer.concat(input=directional_states)

    #### Decoder
    # The projected last_seq output is the boot layer of the decoder memory.
    final_encoder_state = paddle.layer.last_seq(input=encoded_vector)
    with paddle.layer.mixed(
            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
        decoder_boot += paddle.layer.full_matrix_projection(
            input=final_encoder_state)

    def gru_decoder_without_attention(enc_vec, current_word):
        """Run one decoder step and return the softmax layer over the
        target dictionary."""
        previous_state = paddle.layer.memory(
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
        source_context = paddle.layer.last_seq(input=enc_vec)

        # The GRU step input combines the source context with the current
        # target word.
        with paddle.layer.mixed(size=decoder_size * 3) as step_input:
            step_input += paddle.layer.full_matrix_projection(
                input=source_context)
            step_input += paddle.layer.full_matrix_projection(
                input=current_word)

        # Sharing the name 'gru_decoder' with the memory layer above links
        # this step's output to the memory.
        next_state = paddle.layer.gru_step(
            name='gru_decoder',
            act=paddle.activation.Tanh(),
            gate_act=paddle.activation.Sigmoid(),
            input=step_input,
            output_mem=previous_state,
            size=decoder_size)

        with paddle.layer.mixed(
                size=target_dict_dim,
                bias_attr=True,
                act=paddle.activation.Softmax()) as word_distribution:
            word_distribution += paddle.layer.full_matrix_projection(
                input=next_state)
        return word_distribution

    decoder_group_name = "decoder_group"
    # The StaticInput defines a read-only memory for the recurrent group.
    group_inputs = [
        paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
    ]

    if generating:
        # In generation, the decoder predicts the next target word based on
        # the encoded source sequence and the last generated target word.
        # The embedding of the last generated word is fetched automatically
        # by GeneratedInput, which is initialized with a start mark such as
        # <s> and must be included in generation.
        group_inputs.append(
            paddle.layer.GeneratedInputV2(
                size=target_dict_dim,
                embedding_name='_target_language_embedding',
                embedding_size=word_vector_dim))
        return paddle.layer.beam_search(
            name=decoder_group_name,
            step=gru_decoder_without_attention,
            input=group_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=max_length)

    # In training, the target embedding (the ground truth) is the data input,
    # while the encoded source sequence is accessed as an unbounded memory
    # through the StaticInput.
    target_words = paddle.layer.data(
        name='target_language_word',
        type=paddle.data_type.integer_value_sequence(target_dict_dim))
    group_inputs.append(
        paddle.layer.embedding(
            input=target_words,
            size=word_vector_dim,
            param_attr=paddle.attr.ParamAttr(
                name='_target_language_embedding')))
    decoder = paddle.layer.recurrent_group(
        name=decoder_group_name,
        step=gru_decoder_without_attention,
        input=group_inputs)
    next_words = paddle.layer.data(
        name='target_language_next_word',
        type=paddle.data_type.integer_value_sequence(target_dict_dim))
    return paddle.layer.classification_cost(input=decoder, label=next_words)
def train(source_dict_dim, target_dict_dim):
    """Train the network on the WMT14 training set.

    Runs two passes over shuffled batches of 55 samples with an RMSProp
    optimizer. Every 100 batches (except batch 0) the parameters are saved
    to ``models/nmt_without_att_params_batch_<batch_id>.tar.gz``. Every 10
    batches the pass id, batch id, cost and metrics are printed; a dot is
    written for every other batch.

    Args:
        source_dict_dim: source dictionary size; also passed to
            paddle.dataset.wmt14.train.
        target_dict_dim: target dictionary size.
    """
    import os

    checkpoint_pattern = 'models/nmt_without_att_params_batch_%d.tar.gz'
    checkpoint_dir = os.path.dirname(checkpoint_pattern)
    # gzip.open does not create missing directories. Create the checkpoint
    # directory up front so the first save (at batch 100) cannot fail after
    # training has already been running for a while.
    try:
        os.makedirs(checkpoint_dir)
    except OSError:
        if not os.path.isdir(checkpoint_dir):
            raise

    cost = seq2seq_net(source_dict_dim, target_dict_dim)
    parameters = paddle.parameters.create(cost)

    # define optimize method and trainer
    optimizer = paddle.optimizer.RMSProp(
        learning_rate=1e-3,
        gradient_clipping_threshold=10.0,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # define data reader
    wmt14_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
        batch_size=55)

    # define event_handler callback
    def event_handler(event):
        """Checkpoint and report progress at the end of each iteration."""
        if not isinstance(event, paddle.event.EndIteration):
            return

        if event.batch_id % 100 == 0 and event.batch_id > 0:
            with gzip.open(checkpoint_pattern % event.batch_id, 'w') as f:
                parameters.to_tar(f)

        if event.batch_id % 10 == 0:
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("\nPass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics))
        else:
            sys.stdout.write('.')
            sys.stdout.flush()

    # start to train
    trainer.train(
        reader=wmt14_reader, event_handler=event_handler, num_passes=2)
def generate(source_dict_dim,
             target_dict_dim,
             model_path='models/nmt_without_att_params_batch_400.tar.gz',
             gen_num=3):
    """Translate the first samples of the WMT14 generation set and print them.

    Builds the network in generation mode, loads parameters from a gzipped
    tar file, runs beam search and prints, for every source sentence, the
    candidates together with their probabilities.

    Args:
        source_dict_dim: source dictionary size; also passed to
            paddle.dataset.wmt14.gen and paddle.dataset.wmt14.get_dict.
        target_dict_dim: target dictionary size.
        model_path: gzipped tar file holding the trained parameters.
        gen_num: number of leading samples to translate.

    The number of candidates printed per sample is the module-level
    ``beam_size``, the same value seq2seq_net passes to beam_search.
    """
    import itertools

    # use the first gen_num samples for generation; only the source word ids
    # (item[0]) are fed to the network
    gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
    gen_data = [(item[0], )
                for item in itertools.islice(gen_creator(), gen_num)]

    beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)

    # NOTE: the original example mentions paddle.dataset.wmt14.model() as a
    # pretrained alternative (bleu = 26.92); here the parameters come from a
    # checkpoint file instead.
    with gzip.open(model_path) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # prob is the prediction probabilities, and id is the prediction word.
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=gen_data,
        field=['prob', 'id'])

    # get the dictionary
    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)

    # the delimiter between generated sequences is -1,
    # the first element of each generated sequence is the sequence length
    seq_list = []
    seq = []
    for word_id in beam_result[1]:
        if word_id != -1:
            seq.append(word_id)
        else:
            seq_list.append(
                ' '.join([trg_dict.get(token) for token in seq[1:]]))
            seq = []

    prob = beam_result[0]
    # Iterate over the samples actually collected, so a dataset with fewer
    # than gen_num samples does not cause an IndexError.
    for i, sample in enumerate(gen_data):
        print("\n*******************************************************\n")
        print("src: %s \n" % ' '.join(
            [src_dict.get(token) for token in sample[0]]))
        for j in range(beam_size):
            print("prob = %f: %s" % (prob[i][j],
                                     seq_list[i * beam_size + j]))
def main(generating=True):
    """Initialise paddle and run either generation or training.

    Args:
        generating: when True (the default), call generate(); when False,
            call train(). Both use 30000-word source and target dictionaries.
    """
    paddle.init(use_gpu=False, trainer_count=4)
    source_language_dict_dim = 30000
    target_language_dict_dim = 30000

    if generating:
        generate(source_language_dict_dim, target_language_dict_dim)
    else:
        train(source_language_dict_dim, target_language_dict_dim)


if __name__ == '__main__':
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册