diff --git a/seq2seq/nmt_without_attention_v2.py b/seq2seq/nmt_without_attention_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ad282ace5c0b09854dbf847e83d79137dee9e71
--- /dev/null
+++ b/seq2seq/nmt_without_attention_v2.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+
+import sys
+import gzip
+import paddle.v2 as paddle
+
+### Parameters
+word_vector_dim = 620
+latent_chain_dim = 1000
+
+beam_size = 3
+max_length = 50
+
+
+def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
+    decoder_size = encoder_size = latent_chain_dim
+
+    ### Encoder
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+
+    encoder_forward = paddle.networks.simple_gru(
+        input=src_embedding,
+        act=paddle.activation.Tanh(),
+        gate_act=paddle.activation.Sigmoid(),
+        size=encoder_size,
+        reverse=False)
+    encoder_backward = paddle.networks.simple_gru(
+        input=src_embedding,
+        act=paddle.activation.Tanh(),
+        gate_act=paddle.activation.Sigmoid(),
+        size=encoder_size,
+        reverse=True)
+    encoded_vector = paddle.layer.concat(
+        input=[encoder_forward, encoder_backward])
+
+    ### Decoder
+    encoder_last = paddle.layer.last_seq(input=encoded_vector)
+    with paddle.layer.mixed(
+            size=decoder_size,
+            act=paddle.activation.Tanh()) as encoder_last_projected:
+        encoder_last_projected += paddle.layer.full_matrix_projection(
+            input=encoder_last)
+
+    def gru_decoder_without_attention(enc_vec, current_word):
+
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder',
+            size=decoder_size,
+            boot_layer=encoder_last_projected)
+
+        context = paddle.layer.last_seq(input=enc_vec)
+
+        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+            decoder_inputs += paddle.layer.full_matrix_projection(
+                input=current_word)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            act=paddle.activation.Tanh(),
+            gate_act=paddle.activation.Sigmoid(),
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        with paddle.layer.mixed(
+                size=target_dict_dim,
+                bias_attr=True,
+                act=paddle.activation.Softmax()) as out:
+            out += paddle.layer.full_matrix_projection(input=gru_step)
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_inputs = [group_input1]
+
+    if not generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder without the attention mechanism, the target embedding
+        # (the ground truth) is the data input during training, while the
+        # encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_without_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (the encoder's output) must be specified
+        # by StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is fetched automatically by
+        # GeneratedInput, which is initialized with a start mark such as <s>
+        # and must be included in generation.
+
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_without_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen
+
+
+def train(source_dict_dim, target_dict_dim):
+    cost = seq2seq_net(source_dict_dim, target_dict_dim)
+    parameters = paddle.parameters.create(cost)
+
+    # define the optimization method and the trainer
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+    # define the data reader
+    wmt14_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
+        batch_size=55)
+
+    # define the event_handler callback
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0 and event.batch_id > 0:
+                with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
+                               event.batch_id, 'w') as f:
+                    parameters.to_tar(f)
+
+            if event.batch_id % 10 == 0:
+                # wmt14_test_batch = paddle.batch(
+                #     paddle.reader.shuffle(
+                #         paddle.dataset.wmt14.test(source_dict_dim),
+                #         buf_size=8192), batch_size=1)
+                # test_result = trainer.test(wmt14_test_batch)
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id,
+                    event.batch_id,
+                    event.cost,
+                    event.metrics,  # test_result.cost, test_result.metrics
+                )
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+
+    # start training
+    trainer.train(
+        reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+
+def generate(source_dict_dim, target_dict_dim):
+    # use the first 3 samples for generation
+    gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
+    gen_data = []
+    gen_num = 3
+    for item in gen_creator():
+        gen_data.append((item[0], ))
+        if len(gen_data) == gen_num:
+            break
+
+    beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
+    # load the locally trained parameters; alternatively, the released
+    # pretrained model (BLEU = 26.92) can be used:
+    # parameters = paddle.dataset.wmt14.model()
+    with gzip.open('models/nmt_without_att_params_batch_400.tar.gz') as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    # 'prob' holds the prediction probabilities and 'id' the predicted word ids
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=gen_data,
+        field=['prob', 'id'])
+
+    # get the dictionaries
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
+
+    # the delimiter element of the generated sequences is -1, and the first
+    # element of each generated sequence is the sequence length
+    seq_list = []
+    seq = []
+    for w in beam_result[1]:
+        if w != -1:
+            seq.append(w)
+        else:
+            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+            seq = []
+
+    prob = beam_result[0]
+    for i in xrange(gen_num):
+        print "\n*******************************************************\n"
+        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
+        for j in xrange(beam_size):
+            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=4)
+    source_language_dict_dim = 30000
+    target_language_dict_dim = 30000
+    generating = True
+
+    if generating:
+        generate(source_language_dict_dim, target_language_dict_dim)
+    else:
+        train(source_language_dict_dim, target_language_dict_dim)
+
+
+if __name__ == '__main__':
+    main()
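
The `generate()` path above hard-codes the checkpoint produced at batch 400 of `train()`, while the adjacent comment points at `paddle.dataset.wmt14.model()` as an alternative. A minimal sketch of combining the two, assuming `paddle.dataset.wmt14.model()` returns a `Parameters` object as that commented-out line suggests (the `os.path.exists` fallback and the `load_parameters` helper name are illustrative only, not part of this patch):

```python
import os
import gzip

import paddle.v2 as paddle


def load_parameters(checkpoint='models/nmt_without_att_params_batch_400.tar.gz'):
    # Prefer a locally trained checkpoint; otherwise fall back to the
    # released WMT-14 model (assumed to yield a Parameters object).
    if os.path.exists(checkpoint):
        with gzip.open(checkpoint) as f:
            return paddle.parameters.Parameters.from_tar(f)
    return paddle.dataset.wmt14.model()
```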