From 1b82959ae72b0116de688b172285eebe3c347eea Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 25 May 2017 11:41:37 +0800
Subject: [PATCH] add addressing mechanism configuration, add training script
 and add generating script

---
 ntm_addressing_mechanism/generate.py |  56 +++++++++
 ntm_addressing_mechanism/ntm_conf.py | 180 +++++++++++++++++++++++++++
 ntm_addressing_mechanism/train.py    |  84 +++++++++++++
 3 files changed, 320 insertions(+)
 create mode 100644 ntm_addressing_mechanism/generate.py
 create mode 100644 ntm_addressing_mechanism/ntm_conf.py
 create mode 100644 ntm_addressing_mechanism/train.py

diff --git a/ntm_addressing_mechanism/generate.py b/ntm_addressing_mechanism/generate.py
new file mode 100644
index 00000000..1057d794
--- /dev/null
+++ b/ntm_addressing_mechanism/generate.py
@@ -0,0 +1,56 @@
+import paddle.v2 as paddle
+from ntm_conf import gru_encoder_decoder
+import gzip
+import wmt14
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    dict_size = 30000
+
+    is_hybrid_addressing = True
+    gen_creator = wmt14.gen(dict_size, src_seq_zero=is_hybrid_addressing)
+    gen_data = []
+    gen_num = 3
+
+    for item in gen_creator():
+        gen_data.append((item[0], item[1]))
+        if len(gen_data) == gen_num:
+            break
+
+    beam_gen = gru_encoder_decoder(
+        src_dict_dim=dict_size,
+        trg_dict_dim=dict_size,
+        is_generating=True,
+        is_hybrid_addressing=is_hybrid_addressing)
+
+    with gzip.open('./models/model_pass_00000.tar.gz') as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=gen_data,
+        field=['prob', 'id'])
+
+    src_dict, trg_dict = wmt14.get_dict(dict_size)
+    seq_list = []
+    seq = []
+    for w in beam_result[1]:
+        if w != -1:
+            seq.append(w)
+        else:
+            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+            seq = []
+
+    prob = beam_result[0]
+    beam_size = 3
+    for i in xrange(gen_num):
+        print "\n*******************************************************\n"
+        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
+        for j in xrange(beam_size):
+            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ntm_addressing_mechanism/ntm_conf.py b/ntm_addressing_mechanism/ntm_conf.py
new file mode 100644
index 00000000..13321385
--- /dev/null
+++ b/ntm_addressing_mechanism/ntm_conf.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import paddle.v2 as paddle
+import sys
+import math
+
+
+def gru_encoder_decoder(src_dict_dim,
+                        trg_dict_dim,
+                        is_generating=False,
+                        is_hybrid_addressing=True,
+                        word_vec_dim=512,
+                        latent_chain_dim=512,
+                        beam_max_len=230,
+                        beam_size=3):
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(src_dict_dim))
+
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vec_dim,
+        param_attr=paddle.attr.ParamAttr(
+            name='_source_language_embedding',
+            initial_std=1. / math.sqrt(word_vec_dim)))
+    # use a bi-directional GRU as the encoder
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=latent_chain_dim)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=latent_chain_dim, reverse=True)
+    encoder_vector = paddle.layer.concat(input=[src_forward, src_backward])
+    with paddle.layer.mixed(
+            size=latent_chain_dim, bias_attr=False,
+            act=paddle.activation.Linear()) as encoder_projected:
+        encoder_projected += paddle.layer.full_matrix_projection(
+            input=encoder_vector)
+
+    if is_hybrid_addressing:
+        attention_memory_init = paddle.layer.data(
+            name='init_attention_weights',
+            type=paddle.data_type.dense_vector(1))
+        # expand the dense vector to a sequence
+        expand_attention_memory_init = paddle.layer.expand(
+            input=attention_memory_init, expand_as=src_word_id, bias_attr=False)
+
+    # build the decoder with/without the addressing mechanism
+    def gru_decoder_with_attention(encoder_projected, current_word):
+        decoder_state_memory = paddle.layer.memory(
+            name='gru_decoder', size=latent_chain_dim, is_seq=False)
+
+        # compute the attention weights for the current decoding step
+        with paddle.layer.mixed(
+                size=latent_chain_dim,
+                act=paddle.activation.Linear(),
+                bias_attr=False) as decoder_state_projected:
+            decoder_state_projected += paddle.layer.full_matrix_projection(
+                input=decoder_state_memory)
+        expand_decoder_state_projected = paddle.layer.expand(
+            input=decoder_state_projected,
+            expand_as=encoder_projected,
+            bias_attr=False)
+        with paddle.layer.mixed(
+                size=latent_chain_dim,
+                act=paddle.activation.Tanh(),
+                bias_attr=False) as attention_vecs:
+            attention_vecs += paddle.layer.identity_projection(
+                input=expand_decoder_state_projected)
+            attention_vecs += paddle.layer.identity_projection(
+                input=encoder_projected)
+        with paddle.layer.mixed(
+                name='attention_weights',
+                size=1,
+                act=paddle.activation.SequenceSoftmax(),
+                bias_attr=False) as attention_weights:
+            attention_weights += paddle.layer.full_matrix_projection(
+                input=attention_vecs)
+
+        if not is_hybrid_addressing:
+            context_vectors = paddle.layer.scaling(
+                input=encoder_projected, weight=attention_weights)
+        else:
+            # save the attention weights of the last step
+            attention_weight_memory = paddle.layer.memory(
+                name='attention_weights',
+                size=1,
+                is_seq=True,
+                boot_layer=expand_attention_memory_init)
+
+            # interpolation weight
+            with paddle.layer.mixed(
+                    size=1, act=paddle.activation.Sigmoid(),
+                    bias_attr=False) as addressing_gate:
+                addressing_gate += paddle.layer.full_matrix_projection(
+                    input=current_word)
+            expand_addressing_gate = paddle.layer.expand(
+                input=addressing_gate,
+                expand_as=encoder_projected,
+                bias_attr=False)
+            weight_interpolation = paddle.layer.interpolation(
+                input=[attention_weights, attention_weight_memory],
+                weight=expand_addressing_gate)
+
+            # convolutional shift
+            with paddle.layer.mixed(
+                    size=3,
+                    act=paddle.activation.Softmax(),
+                    bias_attr=paddle.attr.Param(
+                        initial_std=0)) as shifting_weights:
+                shifting_weights += paddle.layer.full_matrix_projection(
+                    input=current_word)
+            convolutional_shift = paddle.layer.conv_shift(
+                a=weight_interpolation, b=shifting_weights)
+            context_vectors = paddle.layer.scaling(
+                input=encoder_projected, weight=convolutional_shift)
+
+        # sum over the sequence to get the context vector
+        context = paddle.layer.pooling(
+            input=context_vectors, pooling_type=paddle.pooling.Sum())
+
+        with paddle.layer.mixed(
+                size=latent_chain_dim * 3,
+                layer_attr=paddle.attr.ExtraAttr(
+                    error_clipping_threshold=100.0)) as decoder_step_input:
+            decoder_step_input += paddle.layer.full_matrix_projection(
+                input=context)
+            decoder_step_input += paddle.layer.full_matrix_projection(
+                input=current_word)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_step_input,
+            output_mem=decoder_state_memory,
+            size=latent_chain_dim)
+
+        with paddle.layer.mixed(
+                size=trg_dict_dim,
+                act=paddle.activation.Softmax(),
+                bias_attr=paddle.attr.Param(initial_std=0)) as out:
+            out += paddle.layer.full_matrix_projection(input=gru_step)
+
+        return out
+
+    decoder_group_name = 'decoder_group'
+    group_inputs = [
+        paddle.layer.StaticInputV2(input=encoder_projected, is_seq=True)
+    ]
+
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(trg_dict_dim)),
+            size=word_vec_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(trg_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+        return cost
+    else:
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=trg_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vec_dim)
+        group_inputs.append(trg_embedding)
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=beam_max_len)
+        return beam_gen
diff --git a/ntm_addressing_mechanism/train.py b/ntm_addressing_mechanism/train.py
new file mode 100644
index 00000000..3c6324e5
--- /dev/null
+++ b/ntm_addressing_mechanism/train.py
@@ -0,0 +1,84 @@
+import paddle.v2 as paddle
+from ntm_conf import gru_encoder_decoder
+import wmt14
+import sys
+import gzip
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1, log_error_clipping=True)
+    dict_size = 30000
+
+    is_hybrid_addressing = True
+    cost = gru_encoder_decoder(
+        src_dict_dim=dict_size,
+        trg_dict_dim=dict_size,
+        is_generating=False,
+        is_hybrid_addressing=is_hybrid_addressing)
+
+    parameters = paddle.parameters.create(cost)
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=5e-4,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(
+            average_window=0.5, max_average_window=2500),
+        learning_rate_decay_a=0.0,
+        learning_rate_decay_b=0.0,
+        gradient_clipping_threshold=25)
+
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    # define the data reader
+    wmt14_reader = paddle.batch(
+        paddle.reader.shuffle(
+            wmt14.train(dict_size, src_seq_zero=is_hybrid_addressing),
+            buf_size=8192),
+        batch_size=5)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndPass):
+            model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
+            print('Save model to %s !' % model_name)
+            with gzip.open(model_name, 'w') as f:
+                parameters.to_tar(f)
+
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 10 == 0:
+                print("\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics))
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+
+            if event.batch_id % 100 == 0:
+                model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
+                print('Save model to %s !' % model_name)
+                with gzip.open(model_name, 'w') as f:
+                    parameters.to_tar(f)
+
+    if is_hybrid_addressing:
+        feeding = {
+            'source_language_word': 0,
+            'init_attention_weights': 1,
+            'target_language_word': 2,
+            'target_language_next_word': 3
+        }
+    else:
+        feeding = {
+            'source_language_word': 0,
+            'target_language_word': 1,
+            'target_language_next_word': 2
+        }
+
+    # start training
+    trainer.train(
+        reader=wmt14_reader,
+        event_handler=event_handler,
+        num_passes=2,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
--
GitLab
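
Note: train.py and generate.py import a local wmt14 module that is not included
in this patch. The sketch below is only a guess at its interface, assuming it
thinly wraps the readers in paddle.v2.dataset.wmt14 and, when src_seq_zero is
set, inserts the single zero consumed by the 'init_attention_weights' data
layer right after the source word ids so that the sample layout matches the
feeding dict in train.py. The names train, gen, get_dict and src_seq_zero are
taken from the scripts above; everything else is an assumption, not the
author's implementation.

# wmt14.py -- hypothetical sketch, not part of this patch
import paddle.v2 as paddle


def _with_zero_init(creator):
    # Insert a single zero after the source ids so that samples line up with
    # the feeding order in train.py: source_language_word(0),
    # init_attention_weights(1), target_language_word(2),
    # target_language_next_word(3).
    def reader():
        for sample in creator():
            yield (sample[0], [0.]) + tuple(sample[1:])

    return reader


def train(dict_size, src_seq_zero=False):
    creator = paddle.dataset.wmt14.train(dict_size)
    return _with_zero_init(creator) if src_seq_zero else creator


def gen(dict_size, src_seq_zero=False):
    creator = paddle.dataset.wmt14.gen(dict_size)
    return _with_zero_init(creator) if src_seq_zero else creator


def get_dict(dict_size):
    return paddle.dataset.wmt14.get_dict(dict_size)

With this layout, gen_data.append((item[0], item[1])) in generate.py picks up
the source ids and the attention-weight initializer, which is what the
hybrid-addressing branch in ntm_conf.py expects.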