提交 1b82959a 编写于 作者: Y yangyaming

Add addressing-mechanism configuration, a training script, and a generation script.

上级 b31baa95
import paddle.v2 as paddle
from ntm_conf import gru_encoder_decoder
import gzip
import wmt14
def main():
paddle.init(use_gpu=False, trainer_count=1)
dict_size = 30000
is_hybrid_addressing = True
gen_creator = wmt14.gen(dict_size, src_seq_zero=is_hybrid_addressing)
gen_data = []
gen_num = 3
for item in gen_creator():
gen_data.append((item[0], item[1]))
if len(gen_data) == gen_num:
break
beam_gen = gru_encoder_decoder(
src_dict_dim=dict_size,
trg_dict_dim=dict_size,
is_generating=True,
is_hybrid_addressing=is_hybrid_addressing)
with gzip.open('./models/model_pass_00000.tar.gz') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
src_dict, trg_dict = wmt14.get_dict(dict_size)
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
beam_size = 3
for i in xrange(gen_num):
print "\n*******************************************************\n"
print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + 1]
if __name__ == '__main__':
main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import paddle.v2 as paddle
import sys
import math
def gru_encoder_decoder(src_dict_dim,
                        trg_dict_dim,
                        is_generating=False,
                        is_hybrid_addressing=True,
                        word_vec_dim=512,
                        latent_chain_dim=512,
                        beam_max_len=230,
                        beam_size=3):
    """Build a bi-GRU encoder / GRU decoder with content attention and an
    optional NTM-style hybrid addressing mechanism (interpolation with the
    previous step's attention weights plus a convolutional shift).

    :param src_dict_dim: size of the source-language vocabulary.
    :param trg_dict_dim: size of the target-language vocabulary.
    :param is_generating: False -> return the training cost layer;
        True -> return a beam-search generation layer.
    :param is_hybrid_addressing: enable the NTM addressing extension; the
        data provider must then feed an 'init_attention_weights' slot.
    :param word_vec_dim: word-embedding dimension.
    :param latent_chain_dim: hidden size of the encoder/decoder GRUs.
    :param beam_max_len: maximum generated sequence length (generation only).
    :param beam_size: beam width (generation only).
    :return: a classification-cost layer (training) or a beam-search
        layer (generation).
    """
    # Source sentence as an integer-id sequence.
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(src_dict_dim))
    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vec_dim,
        param_attr=paddle.attr.ParamAttr(
            name='_source_language_embedding',
            initial_std=1. / math.sqrt(word_vec_dim)))

    # use bi-gru as encoder
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=latent_chain_dim)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=latent_chain_dim, reverse=True)
    encoder_vector = paddle.layer.concat(input=[src_forward, src_backward])

    # Project the concatenated (2 * latent_chain_dim) encoder states back
    # down to latent_chain_dim for use by the attention computation.
    with paddle.layer.mixed(
            size=latent_chain_dim, bias_attr=False,
            act=paddle.activation.Linear()) as encoder_projected:
        encoder_projected += paddle.layer.full_matrix_projection(
            input=encoder_vector)

    if is_hybrid_addressing:
        # Initial attention weights supplied by the data provider (a single
        # scalar per sample; see the feeding dict in the training script).
        attention_memory_init = paddle.layer.data(
            name='init_attention_weights',
            type=paddle.data_type.dense_vector(1))
        # expand dense vector to sequence
        expand_attention_memory_init = paddle.layer.expand(
            input=attention_memory_init, expand_as=src_word_id, bias_attr=False)

    # build decoder with/without addressing mechanism
    def gru_decoder_with_attention(encoder_projected, current_word):
        # Decoder hidden state from the previous time step; linked to the
        # gru_step layer below via the shared name 'gru_decoder'.
        decoder_state_memory = paddle.layer.memory(
            name='gru_decoder', size=latent_chain_dim, is_seq=False)

        # get attention in this code section
        with paddle.layer.mixed(
                size=latent_chain_dim,
                act=paddle.activation.Linear(),
                bias_attr=False) as decoder_state_projected:
            decoder_state_projected += paddle.layer.full_matrix_projection(
                input=decoder_state_memory)
        # Broadcast the projected decoder state along the source sequence.
        expand_decoder_state_projected = paddle.layer.expand(
            input=decoder_state_projected,
            expand_as=encoder_projected,
            bias_attr=False)
        # Additive (Bahdanau-style) attention energies: tanh(Ws + Uh).
        with paddle.layer.mixed(
                size=latent_chain_dim,
                act=paddle.activation.Tanh(),
                bias_attr=False) as attention_vecs:
            attention_vecs += paddle.layer.identity_projection(
                input=expand_decoder_state_projected)
            attention_vecs += paddle.layer.identity_projection(
                input=encoder_projected)
        # Normalize the energies over the source sequence positions.
        with paddle.layer.mixed(
                name='attention_weights',
                size=1,
                act=paddle.activation.SequenceSoftmax(),
                bias_attr=False) as attention_weights:
            attention_weights += paddle.layer.full_matrix_projection(
                input=attention_vecs)

        if is_hybrid_addressing == False:
            # Pure content-based attention: weight the encoder states
            # directly by the softmax scores.
            context_vectors = paddle.layer.scaling(
                input=encoder_projected, weight=attention_weights)
        else:
            # save attention weights of last step
            # (linked to the 'attention_weights' layer above by name).
            attention_weight_memory = paddle.layer.memory(
                name='attention_weights',
                size=1,
                is_seq=True,
                boot_layer=expand_attention_memory_init)

            # interpolating weight
            # A gate g in (0, 1) predicted from the current target word;
            # blends current content weights with last step's weights.
            with paddle.layer.mixed(
                    size=1, act=paddle.activation.Sigmoid(),
                    bias_attr=False) as addressing_gate:
                addressing_gate += paddle.layer.full_matrix_projection(
                    input=current_word)
            expand_addressing_gate = paddle.layer.expand(
                input=addressing_gate,
                expand_as=encoder_projected,
                bias_attr=False)
            weight_interpolation = paddle.layer.interpolation(
                input=[attention_weights, attention_weight_memory],
                weight=expand_addressing_gate)

            # convolution shift
            # A 3-tap softmax-normalized shift kernel predicted from the
            # current word, convolved over the interpolated weights.
            with paddle.layer.mixed(
                    size=3,
                    act=paddle.activation.Softmax(),
                    bias_attr=paddle.attr.Param(
                        initial_std=0)) as shifting_weights:
                shifting_weights += paddle.layer.full_matrix_projection(
                    input=current_word)
            convolutional_shift = paddle.layer.conv_shift(
                a=weight_interpolation, b=shifting_weights)
            context_vectors = paddle.layer.scaling(
                input=encoder_projected, weight=convolutional_shift)

        # sum together to get context vector
        context = paddle.layer.pooling(
            input=context_vectors, pooling_type=paddle.pooling.Sum())

        # Decoder step input: projected context plus the current target
        # word, sized for the GRU step's 3 internal gates.
        with paddle.layer.mixed(
                size=latent_chain_dim * 3,
                layer_attr=paddle.attr.ExtraAttr(
                    error_clipping_threshold=100.0)) as decoder_step_input:
            decoder_step_input += paddle.layer.full_matrix_projection(
                input=context)
            decoder_step_input += paddle.layer.full_matrix_projection(
                input=current_word)
        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_step_input,
            output_mem=decoder_state_memory,
            size=latent_chain_dim)
        # Softmax distribution over the target vocabulary.
        with paddle.layer.mixed(
                size=trg_dict_dim,
                act=paddle.activation.Softmax(),
                bias_attr=paddle.attr.Param(initial_std=0)) as out:
            out += paddle.layer.full_matrix_projection(input=gru_step)
        return out

    decoder_group_name = 'decoder_group'
    # The encoder output is static (read-only) across decoder steps.
    group_inputs = [
        paddle.layer.StaticInputV2(input=encoder_projected, is_seq=True)
    ]
    if not is_generating:
        # Training: feed ground-truth target words, predict the next word.
        trg_embedding = paddle.layer.embedding(
            input=paddle.layer.data(
                name='target_language_word',
                type=paddle.data_type.integer_value_sequence(trg_dict_dim)),
            size=word_vec_dim,
            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
        group_inputs.append(trg_embedding)
        decoder = paddle.layer.recurrent_group(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)
        lbl = paddle.layer.data(
            name='target_language_next_word',
            type=paddle.data_type.integer_value_sequence(trg_dict_dim))
        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
        return cost
    else:
        # Generation: the decoder consumes its own previous predictions,
        # sharing the trained target embedding table by parameter name.
        trg_embedding = paddle.layer.GeneratedInputV2(
            size=trg_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vec_dim)
        group_inputs.append(trg_embedding)
        beam_gen = paddle.layer.beam_search(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=beam_max_len)
        return beam_gen
import paddle.v2 as paddle
from ntm_conf import gru_encoder_decoder
import wmt14
import sys
import gzip
def main():
    """Train the hybrid-addressing GRU encoder-decoder on WMT14 data."""
    paddle.init(use_gpu=False, trainer_count=1, log_error_clipping=True)

    dict_size = 30000
    is_hybrid_addressing = True

    # Build the training network and create its trainable parameters.
    cost = gru_encoder_decoder(
        src_dict_dim=dict_size,
        trg_dict_dim=dict_size,
        is_generating=False,
        is_hybrid_addressing=is_hybrid_addressing)
    parameters = paddle.parameters.create(cost)

    # Adam with L2 regularization, model averaging and gradient clipping.
    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-4,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=2500),
        learning_rate_decay_a=0.0,
        learning_rate_decay_b=0.0,
        gradient_clipping_threshold=25)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # define data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            wmt14.train(dict_size, src_seq_zero=is_hybrid_addressing),
            buf_size=8192),
        batch_size=5)

    def save_model(pass_id):
        # Snapshot the parameters to a gzipped tar named after the pass.
        model_name = './models/model_pass_%05d.tar.gz' % pass_id
        print('Save model to %s !' % model_name)
        with gzip.open(model_name, 'w') as f:
            parameters.to_tar(f)

    def event_handler(event):
        # Checkpoint at the end of every pass.
        if isinstance(event, paddle.event.EndPass):
            save_model(event.pass_id)
        # Log progress during a pass and checkpoint periodically.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print("\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
            if event.batch_id % 100 == 0:
                save_model(event.pass_id)

    # Map data-provider slots to input layers; the hybrid-addressing
    # network needs an extra initial-attention-weights slot.
    if is_hybrid_addressing:
        feeding = {
            'source_language_word': 0,
            'init_attention_weights': 1,
            'target_language_word': 2,
            'target_language_next_word': 3
        }
    else:
        feeding = {
            'source_language_word': 0,
            'target_language_word': 1,
            'target_language_next_word': 2
        }

    # start to train
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        num_passes=2,
        feeding=feeding)


if __name__ == '__main__':
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册