Commit ee1262d5 authored by caoying03

update model config.

Parent d30a28c7
......@@ -38,9 +38,11 @@ def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
         paddle.layer.last_seq(input=lstm_last[0]),
         paddle.layer.first_seq(input=lstm_last[1]),
     ])
-    return final_states, paddle.layer.concat(
+    lstm_outs = paddle.layer.concat(
         input=lstm_last,
-        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate), )
+        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate))
+    return final_states, lstm_outs


 def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
......@@ -70,8 +72,9 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
             name="__inner_state_%s__" % name,
             size=hidden_dim,
             boot_layer=outer_memory)
-        input_proj = paddle.layer.fc(
-            size=hidden_dim * 4, bias_attr=False, input=input_layer)
+        input_proj = paddle.layer.fc(size=hidden_dim * 4,
+                                     bias_attr=False,
+                                     input=input_layer)
         return paddle.networks.lstmemory_unit(
             input=input_proj,
             name="__inner_state_%s__" % name,
......@@ -91,12 +94,12 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
             inner_last_output = paddle.layer.first_seq(
                 input=inner_out,
                 name="__inner_%s_last__" % name,
-                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
         else:
             inner_last_output = paddle.layer.last_seq(
                 input=inner_out,
                 name="__inner_%s_last__" % name,
-                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
         return inner_out

     return paddle.layer.recurrent_group(
......
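In Paddle v2, `first_seq`/`last_seq` with `AggregateLevel.TO_NO_SEQUENCE` reduce a plain sequence to a single vector, while `TO_SEQUENCE` reduces each sub-sequence of a nested sequence, yielding one vector per sub-sequence. The inner sequences above are plain, so `TO_NO_SEQUENCE` is the appropriate level. A minimal sketch of the distinction, with `word_states` and `doc_states` as hypothetical inputs:

```python
import paddle.v2 as paddle

# plain sequence (one vector per word) -> one vector for the whole sequence
word_states = paddle.layer.data(
    name="word_states", type=paddle.data_type.dense_vector_sequence(8))
seq_last = paddle.layer.last_seq(
    input=word_states,
    agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)

# nested sequence (sentences of words) -> one vector per sentence
doc_states = paddle.layer.data(
    name="doc_states", type=paddle.data_type.dense_vector_sub_sequence(8))
per_sentence_last = paddle.layer.last_seq(
    input=doc_states,
    agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
```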
......@@ -6,8 +6,8 @@ __all__ = ["ModelConfig"]

 class ModelConfig(object):
     beam_size = 3
-    vocab_size = 102400
-    embedding_dim = 256
+    vocab_size = 104808
+    embedding_dim = 300

     embedding_droprate = 0.3
     lstm_depth = 3
......@@ -17,9 +17,17 @@ class ModelConfig(object):
     passage_indep_embedding_dim = 300
     passage_aligned_embedding_dim = 128

-    beam_size = 5
+    beam_size = 32

     dict_path = "data/featurized/vocab.txt"
     pretrained_emb_path = "data/featurized/embeddings.npy"
+
+
+class TrainerConfig(object):
+    learning_rate = 1e-3
+    data_dir = "data/featurized"
+    save_dir = "models"
+    batch_size = 12 * 4
+    epochs = 100
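The enlarged `vocab_size` and `embedding_dim` must stay consistent with the dictionary file and the pretrained matrix, because `load_pretrained_parameters` (see train.py below) loads the `.npy` file as-is. A hypothetical sanity check, assuming both data files exist and the dictionary stores one entry per line:

```python
import numpy as np

from config import ModelConfig

# one dictionary entry per line; the embedding matrix has one row per entry
with open(ModelConfig.dict_path) as f:
    assert sum(1 for _ in f) == ModelConfig.vocab_size
emb = np.load(ModelConfig.pretrained_emb_path)
assert emb.shape == (ModelConfig.vocab_size, ModelConfig.embedding_dim)
```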
......@@ -10,15 +10,10 @@ from config import ModelConfig

 __all__ = ["GNR"]

-def build_pretrained_embedding(name,
-                               data_type,
-                               vocab_size,
-                               emb_dim,
-                               emb_drop=0.):
-    one_hot_input = paddle.layer.data(
-        name=name, type=paddle.data_type.integer_value_sequence(vocab_size))
+def build_pretrained_embedding(name, data_type, emb_dim, emb_drop=0.):
     return paddle.layer.embedding(
-        input=one_hot_input,
+        input=paddle.layer.data(
+            name=name, type=data_type),
         size=emb_dim,
         param_attr=paddle.attr.Param(
             name="GloveVectors", is_static=True),
......@@ -112,25 +107,24 @@ def encode_documents(input_embedding, same_as_question, question_vector,
     ])


-def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config):
+def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
+                  is_infer):
     last_state_of_sentence = paddle.layer.last_seq(
         input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+    # NOTE: do not use the sequence softmax activation here.
     sentence_scores = paddle.layer.fc(input=last_state_of_sentence,
                                       size=1,
-                                      act=paddle.activation.Exp())
+                                      act=paddle.activation.Linear())
     topk_sentence_ids = paddle.layer.kmax_sequence_score(
         input=sentence_scores, beam_size=config.beam_size)
     topk_sen = paddle.layer.sub_nested_seq(
-        input=last_state_of_sentence, selected_indices=topk_sentence_ids)
+        input=doc_lstm_outs, selected_indices=topk_sentence_ids)

     # expand beam to search start positions on selected sentences
     start_pos_scores = paddle.layer.fc(input=topk_sen,
                                        size=1,
-                                       act=paddle.activation.Exp())
+                                       act=paddle.activation.Linear())
     topk_start_pos_ids = paddle.layer.kmax_sequence_score(
-        input=sentence_scores, beam_size=config.beam_size)
+        input=start_pos_scores, beam_size=config.beam_size)
     topk_start_spans = paddle.layer.seq_slice(
         input=topk_sen, starts=topk_start_pos_ids, ends=None)
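Selecting from `doc_lstm_outs` rather than from the per-sentence last states matters because the start/end scoring that follows needs every word position of the chosen sentences, and `seq_slice` operates on those word-level states. A minimal sketch of the selection pattern, with `doc_states` as a hypothetical nested-sequence input:

```python
import paddle.v2 as paddle

# nested sequence: one 8-d vector per word, grouped by sentence
doc_states = paddle.layer.data(
    name="doc_states", type=paddle.data_type.dense_vector_sub_sequence(8))
# score each sentence by its last word state
sent_scores = paddle.layer.fc(
    input=paddle.layer.last_seq(
        input=doc_states,
        agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE),
    size=1,
    act=paddle.activation.Linear())
top_ids = paddle.layer.kmax_sequence_score(input=sent_scores, beam_size=2)
# keep the full word sequences of the top-scoring sentences
top_sentences = paddle.layer.sub_nested_seq(
    input=doc_states, selected_indices=top_ids)
```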
......@@ -143,33 +137,40 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
         prefix="__end_span_embeddings__")
     end_pos_scores = paddle.layer.fc(input=end_span_embedding,
                                      size=1,
-                                     act=paddle.activation.Exp())
+                                     act=paddle.activation.Linear())
     topk_end_pos_ids = paddle.layer.kmax_sequence_score(
         input=end_pos_scores, beam_size=config.beam_size)

-    cost = paddle.layer.cross_entropy_over_beam(
-        input=[
-            sentence_scores, topk_sentence_ids, start_pos_scores,
-            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
-        ],
-        label=[sentence_idx, start_idx, end_idx])
-    return cost
+    if is_infer:
+        return [topk_sentence_ids, topk_start_pos_ids, topk_end_pos_ids]
+    else:
+        return paddle.layer.cross_entropy_over_beam(input=[
+            paddle.layer.BeamInput(sentence_scores, topk_sentence_ids,
+                                   sentence_idx),
+            paddle.layer.BeamInput(start_pos_scores, topk_start_pos_ids,
+                                   start_idx),
+            paddle.layer.BeamInput(end_pos_scores, topk_end_pos_ids, end_idx)
+        ])


-def GNR(config):
+def GNR(config, is_infer=False):
     # encoding question words
     question_embeddings = build_pretrained_embedding(
-        "question", paddle.data_type.integer_value_sequence, config.vocab_size,
+        "question",
+        paddle.data_type.integer_value_sequence(config.vocab_size),
         config.embedding_dim, config.embedding_droprate)
     question_vector, question_lstm_outs = encode_question(
         input_embedding=question_embeddings, config=config, prefix="__ques")

     # encoding document words
     document_embeddings = build_pretrained_embedding(
-        "documents", paddle.data_type.integer_value_sub_sequence,
-        config.vocab_size, config.embedding_dim, config.embedding_droprate)
+        "documents",
+        paddle.data_type.integer_value_sub_sequence(config.vocab_size),
+        config.embedding_dim, config.embedding_droprate)
     same_as_question = paddle.layer.data(
         name="same_as_question",
-        type=paddle.data_type.integer_value_sub_sequence(2))
+        type=paddle.data_type.dense_vector_sub_sequence(1))
     document_words_ecoding = encode_documents(
         input_embedding=document_embeddings,
         question_vector=question_vector,
......@@ -192,7 +193,7 @@ def GNR(config):
     end_idx = paddle.layer.data(
         name="end_idx", type=paddle.data_type.integer_value(1))
     return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
-                         config)
+                         config, is_infer)


 if __name__ == "__main__":
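The `BeamInput` form makes each search step an explicit (candidate scores, top-k selection, gold label) triple, which is what lets the beam cross entropy normalize over the three decisions jointly. A self-contained single-step sketch of the pattern (layer names and sizes are made up):

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# hypothetical per-position features, one 8-d vector per sequence element
feats = paddle.layer.data(
    name="feats", type=paddle.data_type.dense_vector_sequence(8))
# one linear score per position (no sequence softmax, as noted above)
scores = paddle.layer.fc(
    input=feats, size=1, act=paddle.activation.Linear())
# the k best-scoring positions become the beam candidates
topk_ids = paddle.layer.kmax_sequence_score(input=scores, beam_size=3)
# gold position to supervise against
gold = paddle.layer.data(name="gold", type=paddle.data_type.integer_value(1))
cost = paddle.layer.cross_entropy_over_beam(
    input=[paddle.layer.BeamInput(scores, topk_ids, gold)])
```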
......
 #!/usr/bin/env python
 #coding=utf-8
 import os
 import random
 import json
 import logging

 logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)


 def train_reader(data_list, is_train=True):
......@@ -14,22 +17,20 @@ def train_reader(data_list, is_train=True):
         for train_sample in data_list:
             data = json.load(open(train_sample, "r"))
             sent_len = data['sent_lengths']
-            doc_len = len(data['context'])
-            same_as_question_word = [[[x]]
-                                     for x in data['same_as_question_word']]
-            ans_sentence = [0] * doc_len
-            ans_sentence[data['ans_sentence']] = 1
-            ans_start = [0] * doc_len
-            ans_start[data['ans_start']] = 1
-            ans_end = [0] * doc_len
-            ans_end[data['ans_end']] = 1
-            yield (data['question'], data['context'], same_as_question_word,
-                   ans_sentence, ans_start, ans_end)
+            start_pos = 0
+            doc = []
+            same_as_question_word = []
+            for l in data['sent_lengths']:
+                doc.append(data['context'][start_pos:start_pos + l])
+                same_as_question_word.append([
+                    [[x]] for x in data['same_as_question_word']
+                ][start_pos:start_pos + l])
+                start_pos += l
+            yield (data['question'], doc, same_as_question_word,
+                   data['ans_sentence'], data['ans_start'],
+                   data['ans_end'] - data['ans_start'])

     return reader
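Each sample is now nested by sentence instead of flat over the whole document, and the last label stores the span length (`ans_end - ans_start`) rather than an end position, matching the `seq_slice`-based span scoring in the model. An illustrative sample with made-up values, for `sent_lengths = [2, 3]`:

```python
# question: word ids; doc: one sub-list of word ids per sentence
question = [3, 17, 42]
doc = [[5, 9], [11, 2, 8]]
# mirrors doc's nesting: one [[flag]] entry per word
same_as_question_word = [[[[0]], [[1]]], [[[0]], [[0]], [[1]]]]
# labels: answer sentence id, answer start position, answer span length
sample = (question, doc, same_as_question_word, 1, 0, 2)
```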
......
......@@ -9,6 +9,7 @@ import logging
 import random
 import glob
 import gzip
+import numpy as np

 import reader
 import paddle.v2 as paddle
......@@ -21,7 +22,7 @@ logger.setLevel(logging.INFO)

 def load_pretrained_parameters(path, height, width):
-    return
+    return np.load(path)


 def save_model(save_path, parameters):
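`np.load` returns the saved array unchanged, so the `.npy` file must already have the `(height, width)` shape the embedding layer expects; the two size arguments are currently unused. A hypothetical preparation step that writes a compatible file (a real setup would dump actual GloVe vectors instead of random values):

```python
import numpy as np

# made-up embedding matrix: one row per vocabulary word
vocab_size, embedding_dim = 104808, 300
emb = np.random.rand(vocab_size, embedding_dim).astype("float32")
np.save("data/featurized/embeddings.npy", emb)
```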
......@@ -51,27 +52,30 @@ def choose_samples(path):
     train_samples.sort()
     valid_samples.sort()
-    random.shuffle(train_samples)
+    # random.shuffle(train_samples)

     return train_samples, valid_samples


-def build_reader(data_dir):
+def build_reader(data_dir, batch_size):
     """
     Build the data reader for this model.
     """
     train_samples, valid_samples = choose_samples(data_dir)

     train_reader = paddle.batch(
         paddle.reader.shuffle(
             reader.train_reader(train_samples), buf_size=102400),
-        batch_size=config.batch_size)
+        batch_size=batch_size)
+    # train_reader = paddle.batch(
+    #     reader.train_reader(train_samples), batch_size=batch_size)

     # testing data is not shuffled
     test_reader = paddle.batch(
-        reader.train_reader(valid_samples, is_train=False),
-        batch_size=config.batch_size)
+        reader.train_reader(
+            valid_samples, is_train=False),
+        batch_size=batch_size)

     return train_reader, test_reader
......@@ -85,53 +89,65 @@ def build_event_handler(config, parameters, trainer, test_reader):
         """The event handler."""
         if isinstance(event, paddle.event.EndIteration):
             if (not event.batch_id % 100) and event.batch_id:
-                save_model("checkpoint_param.latest.tar.gz", parameters)
+                save_path = os.path.join(config.save_dir,
+                                         "checkpoint_param.latest.tar.gz")
+                save_model(save_path, parameters)

-            if not event.batch_id % 5:
+            if not event.batch_id % 1:
                 logger.info(
                     "Pass %d, Batch %d, Cost %f, %s" %
                     (event.pass_id, event.batch_id, event.cost, event.metrics))

         if isinstance(event, paddle.event.EndPass):
-            save_model(config.param_save_filename_format % event.pass_id,
-                       parameters)
-            with gzip.open(param_path, 'w') as handle:
-                parameters.to_tar(handle)
+            save_path = os.path.join(config.save_dir,
+                                     "pass_%05d.tar.gz" % event.pass_id)
+            save_model(save_path, parameters)

-            result = trainer.test(reader=test_reader)
-            logger.info("Test with Pass %d, %s" %
-                        (event.pass_id, result.metrics))
+            # result = trainer.test(reader=test_reader)
+            # logger.info("Test with Pass %d, %s" %
+            #             (event.pass_id, result.metrics))

     return event_handler
 def train(model_config, trainer_config):
-    paddle.init(use_gpu=True, trainer_count=1)
+    if not os.path.exists(trainer_config.save_dir):
+        os.mkdir(trainer_config.save_dir)
+
+    paddle.init(use_gpu=True, trainer_count=4)

     # define the optimizer
     optimizer = paddle.optimizer.Adam(
         learning_rate=trainer_config.learning_rate,
         regularization=paddle.optimizer.L2Regularization(rate=1e-3),
-        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
+        # model_average=paddle.optimizer.ModelAverage(average_window=0.5))
+    )

     # define network topology
-    losses = GNR(model_config)
-    parameters = paddle.parameters.create(losses)
-    # print(parse_network(losses))
-    trainer = paddle.trainer.SGD(
-        cost=losses, parameters=parameters, update_equation=optimizer)
-    """
-    parameters.set('GloveVectors',
-        load_pretrained_parameters(parameter_path, height, width))
-    """
+    loss = GNR(model_config)
+    # print(parse_network(loss))
+    parameters = paddle.parameters.create(loss)
+    parameters.set("GloveVectors",
+                   load_pretrained_parameters(
+                       ModelConfig.pretrained_emb_path,
+                       height=ModelConfig.vocab_size,
+                       width=ModelConfig.embedding_dim))
+
+    trainer = paddle.trainer.SGD(cost=loss,
+                                 parameters=parameters,
+                                 update_equation=optimizer)

     # define data reader
-    train_reader, test_reader = build_reader(trainer_config.data_dir)
+    train_reader, test_reader = build_reader(trainer_config.data_dir,
+                                             trainer_config.batch_size)

-    event_handler = build_event_handler(conf, parameters, trainer, test_reader)
+    event_handler = build_event_handler(trainer_config, parameters, trainer,
+                                        test_reader)
     trainer.train(
         reader=train_reader,
-        num_passes=conf.epochs,
+        num_passes=trainer_config.epochs,
         event_handler=event_handler)
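With the configuration split, the entry point presumably passes both classes; a hypothetical invocation (the `__main__` guard lies outside the hunks shown here):

```python
if __name__ == "__main__":
    from config import ModelConfig, TrainerConfig

    train(ModelConfig, TrainerConfig)
```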
......