Commit d30a28c7 authored by caoying03

proj init.

Parent d42adaca
#!/usr/bin/env python
#coding=utf-8
import pdb
import collections
import paddle.v2 as paddle
from paddle.v2.layer import parse_network
__all__ = [
"stacked_bidirectional_lstm",
"lstm_by_nested_sequence",
]
def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
    """Stack `depth` LSTM layers in each direction over a plain sequence.

    Returns a tuple: (a fixed-size vector built from the final states of the
    forward and backward stacks, the concatenation of their per-time-step
    outputs with dropout applied).
    """
if not isinstance(inputs, collections.Sequence):
inputs = [inputs]
lstm_last = []
for dirt in ["fwd", "bwd"]:
for i in range(depth):
input_proj = paddle.layer.mixed(
name="%s_in_proj_%0d_%s__" % (prefix, i, dirt),
size=size * 4,
bias_attr=paddle.attr.Param(initial_std=0.),
input=[paddle.layer.full_matrix_projection(lstm)] if i else [
paddle.layer.full_matrix_projection(in_layer)
for in_layer in inputs
])
lstm = paddle.layer.lstmemory(
input=input_proj,
bias_attr=paddle.attr.Param(initial_std=0.),
param_attr=paddle.attr.Param(initial_std=5e-4),
reverse=(dirt == "bwd"))
lstm_last.append(lstm)
final_states = paddle.layer.concat(input=[
paddle.layer.last_seq(input=lstm_last[0]),
paddle.layer.first_seq(input=lstm_last[1]),
])
return final_states, paddle.layer.concat(
input=lstm_last,
layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate), )
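# A minimal usage sketch for stacked_bidirectional_lstm, kept as comments so
# that importing this module does not add extra layers to the network config.
# All names below are illustrative only.
#
#   word_emb = paddle.layer.embedding(
#       input=paddle.layer.data(
#           name="demo_word",
#           type=paddle.data_type.integer_value_sequence(1024)),
#       size=128)
#   sent_vec, step_outs = stacked_bidirectional_lstm(
#       inputs=word_emb, size=128, depth=2, drop_rate=0.1, prefix="demo")
#   # sent_vec: a fixed-size vector built from the final states of both
#   # directions; step_outs: the per-word outputs of the topmost layers.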
def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
    '''
    An LSTM implemented with nested recurrent_group.

    A paragraph is a naturally nested sequence:
    1. each paragraph is a sequence of sentences;
    2. each sentence is a sequence of words.

    This function uses a nested recurrent_group to implement the LSTM:
    1. The outer group iterates over the sentences in a paragraph.
    2. The inner group iterates over the words in a sentence.
    3. An LSTM encodes each sentence, and its final output initializes the
       memory of the LSTM that encodes the next sentence.
    4. Parameters are shared among these sentence-encoding LSTMs.
    5. Consequently, this function is equivalent to concatenating all
       sentences in a paragraph into one (long) sentence and encoding that
       long sentence with a single LSTM.
    '''
def lstm_outer_step(lstm_group_input, hidden_dim, reverse, name=''):
outer_memory = paddle.layer.memory(
name="__inner_%s_last__" % name, size=hidden_dim)
def lstm_inner_step(input_layer, hidden_dim, reverse, name):
inner_memory = paddle.layer.memory(
name="__inner_state_%s__" % name,
size=hidden_dim,
boot_layer=outer_memory)
input_proj = paddle.layer.fc(
size=hidden_dim * 4, bias_attr=False, input=input_layer)
return paddle.networks.lstmemory_unit(
input=input_proj,
name="__inner_state_%s__" % name,
out_memory=inner_memory,
size=hidden_dim,
act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(),
state_act=paddle.activation.Tanh())
inner_out = paddle.layer.recurrent_group(
name="__inner_%s__" % name,
step=lstm_inner_step,
reverse=reverse,
input=[lstm_group_input, hidden_dim, reverse, name])
        # The last step of the inner sequence is named "__inner_%s_last__" so
        # that it feeds `outer_memory` above: the final state of one sentence
        # initializes the LSTM that encodes the next sentence.
        if reverse:
inner_last_output = paddle.layer.first_seq(
input=inner_out,
name="__inner_%s_last__" % name,
agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
else:
inner_last_output = paddle.layer.last_seq(
input=inner_out,
name="__inner_%s_last__" % name,
agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
return inner_out
return paddle.layer.recurrent_group(
input=[
paddle.layer.SubsequenceInput(input_layer), hidden_dim, reverse,
name
],
step=lstm_outer_step,
name="__outter_%s__" % name,
reverse=reverse)
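# A minimal usage sketch for lstm_by_nested_sequence (as comments; the
# __main__ block below gives a runnable example of the stacked variant).
# The input must be a nested sequence, e.g. the embedding of sub-sequence
# data; the names below are illustrative only.
#
#   para_emb = paddle.layer.embedding(
#       input=paddle.layer.data(
#           name="demo_para",
#           type=paddle.data_type.integer_value_sub_sequence(1024)),
#       size=128)
#   para_enc = lstm_by_nested_sequence(
#       input_layer=para_emb, hidden_dim=128, name="demo", reverse=False)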
def stacked_bi_lstm_by_nested_seq(input_layer, depth, hidden_dim, prefix=""):
    """Stack lstm_by_nested_sequence encoders in both directions over a
    nested sequence and concatenate their outputs."""
lstm_final_outs = []
for dirt in ["fwd", "bwd"]:
for i in range(depth):
lstm_out = lstm_by_nested_sequence(
input_layer=(lstm_out if i else input_layer),
hidden_dim=hidden_dim,
name="__%s_%s_%02d__" % (prefix, dirt, i),
reverse=(dirt == "bwd"))
lstm_final_outs.append(lstm_out)
return paddle.layer.concat(input=lstm_final_outs)
if __name__ == "__main__":
vocab_size = 1024
emb_dim = 128
embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="word",
type=paddle.data_type.integer_value_sub_sequence(vocab_size)),
size=emb_dim)
print(parse_network(
stacked_bi_lstm_by_nested_seq(
input_layer=embedding, depth=3, hidden_dim=128, prefix="__lstm")))
#!/usr/bin/env python
#coding=utf-8
__all__ = ["ModelConfig"]
class ModelConfig(object):
    vocab_size = 102400

    embedding_dim = 256
    embedding_droprate = 0.3

    lstm_depth = 3
    lstm_hidden_dim = 300
    lstm_hidden_droprate = 0.3

    passage_indep_embedding_dim = 300
    passage_aligned_embedding_dim = 128

    beam_size = 5


class TrainerConfig(object):
    learning_rate = 1e-3
    data_dir = "data/featurized"

    # The fields below are referenced by train.py; the values are placeholders
    # and should be adjusted for the actual task.
    batch_size = 10
    epochs = 10
    param_save_filename_format = "params_pass_%05d.tar.gz"
#!/usr/bin/env python
#coding=utf-8
import pdb
import paddle.v2 as paddle
from paddle.v2.layer import parse_network
import basic_modules
from config import ModelConfig
__all__ = ["GNR"]
def build_pretrained_embedding(name,
                               data_type,
                               vocab_size,
                               emb_dim,
                               emb_drop=0.):
    """Create a word embedding layer backed by a static (non-trainable)
    parameter named "GloveVectors", which can be overwritten with pretrained
    vectors before training."""
    one_hot_input = paddle.layer.data(name=name, type=data_type(vocab_size))
return paddle.layer.embedding(
input=one_hot_input,
size=emb_dim,
param_attr=paddle.attr.Param(
name="GloveVectors", is_static=True),
layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=emb_drop), )
def encode_question(input_embedding, config, prefix):
    """Encode the question with a stacked bidirectional LSTM plus a
    passage-independent attention pooling; return (question vector,
    per-word LSTM outputs)."""
lstm_final, lstm_outs = basic_modules.stacked_bidirectional_lstm(
inputs=input_embedding,
size=config.lstm_hidden_dim,
depth=config.lstm_depth,
drop_rate=config.lstm_hidden_droprate,
prefix=prefix)
# passage-independent embeddings
candidates = paddle.layer.fc(input=lstm_outs,
bias_attr=False,
size=config.passage_indep_embedding_dim,
act=paddle.activation.Linear())
weights = paddle.layer.fc(input=lstm_outs,
size=1,
act=paddle.activation.SequenceSoftmax())
weighted_candidates = paddle.layer.scaling(input=candidates, weight=weights)
passage_indep_embedding = paddle.layer.pooling(
input=weighted_candidates, pooling_type=paddle.pooling.Sum())
return paddle.layer.concat(
input=[lstm_final, passage_indep_embedding]), lstm_outs
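# Sketch of the passage-independent pooling above (shapes are illustrative):
# for a question of T words, `candidates` is a sequence of T vectors of size
# passage_indep_embedding_dim, `weights` is a length-T sequence normalized by
# a sequence softmax (so the T weights sum to 1), and the sum pooling of
# `weighted_candidates` collapses the sequence into a single question vector,
# e.g. with T = 3 and weights (0.2, 0.5, 0.3) the result is
# 0.2 * c_1 + 0.5 * c_2 + 0.3 * c_3.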
def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
config):
def outer_sentence_step(document_embeddings, question_lstm_outs, config):
        '''
        Inside this recurrent_group, document_embeddings is scattered into a
        sequence of sentence embeddings: each step receives one sentence.
        '''
def inner_word_step(word_embedding, question_lstm_outs,
question_outs_proj, config):
            '''
            Inside this recurrent_group, each sentence embedding is scattered
            into word embeddings: each step receives one word.
            '''
doc_word_expand = paddle.layer.expand(
input=word_embedding,
expand_as=question_lstm_outs,
expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
weights = paddle.layer.fc(
input=[question_lstm_outs, doc_word_expand],
size=1,
act=paddle.activation.SequenceSoftmax())
weighted_candidates = paddle.layer.scaling(
input=question_outs_proj, weight=weights)
return paddle.layer.pooling(
input=weighted_candidates, pooling_type=paddle.pooling.Sum())
question_outs_proj = paddle.layer.fc(
input=question_lstm_outs,
bias_attr=False,
size=config.passage_aligned_embedding_dim)
return paddle.layer.recurrent_group(
input=[
paddle.layer.SubsequenceInput(document_embeddings),
paddle.layer.StaticInput(question_lstm_outs),
paddle.layer.StaticInput(question_outs_proj),
config,
],
step=inner_word_step,
name="iter_over_word")
return paddle.layer.recurrent_group(
input=[
paddle.layer.SubsequenceInput(document_embeddings),
paddle.layer.StaticInput(question_lstm_outs), config
],
step=outer_sentence_step,
name="iter_over_sen")
def encode_documents(input_embedding, same_as_question, question_vector,
                     question_lstm_outs, config, prefix):
    """Concatenate, for every document word, its embedding, the (expanded)
    question vector, the same-as-question indicator and the question-aligned
    embedding."""
question_expanded = paddle.layer.expand(
input=question_vector,
expand_as=input_embedding,
expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
question_aligned_embedding = question_aligned_passage_embedding(
question_lstm_outs, input_embedding, config)
return paddle.layer.concat(input=[
input_embedding, question_expanded, same_as_question,
question_aligned_embedding
])
def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config):
    """Beam search over (answer sentence, start position, end position) and
    return the cross-entropy-over-beam cost against the gold labels."""
last_state_of_sentence = paddle.layer.last_seq(
input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
    # NOTE: a sequence softmax activation is intentionally NOT used here; the
    # raw Exp scores are normalized globally by cross_entropy_over_beam.
sentence_scores = paddle.layer.fc(input=last_state_of_sentence,
size=1,
act=paddle.activation.Exp())
topk_sentence_ids = paddle.layer.kmax_sequence_score(
input=sentence_scores, beam_size=config.beam_size)
    topk_sen = paddle.layer.sub_nested_seq(
        input=doc_lstm_outs, selected_indices=topk_sentence_ids)
# expand beam to search start positions on selected sentences
start_pos_scores = paddle.layer.fc(input=topk_sen,
size=1,
act=paddle.activation.Exp())
    topk_start_pos_ids = paddle.layer.kmax_sequence_score(
        input=start_pos_scores, beam_size=config.beam_size)
topk_start_spans = paddle.layer.seq_slice(
input=topk_sen, starts=topk_start_pos_ids, ends=None)
# expand beam to search end positions on selected start spans
_, end_span_embedding = basic_modules.stacked_bidirectional_lstm(
inputs=topk_start_spans,
size=config.lstm_hidden_dim,
depth=config.lstm_depth,
drop_rate=config.lstm_hidden_droprate,
prefix="__end_span_embeddings__")
end_pos_scores = paddle.layer.fc(input=end_span_embedding,
size=1,
act=paddle.activation.Exp())
topk_end_pos_ids = paddle.layer.kmax_sequence_score(
input=end_pos_scores, beam_size=config.beam_size)
cost = paddle.layer.cross_entropy_over_beam(
input=[
sentence_scores, topk_sentence_ids, start_pos_scores,
topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
],
label=[sentence_idx, start_idx, end_idx])
return cost
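# Beam expansion sketch for search_answer: with config.beam_size = K the
# search proceeds in three chained steps,
#   1. score every sentence and keep the top-K sentences,
#   2. score every start position inside the kept sentences and keep the
#      top-K starts,
#   3. re-encode the span from each kept start, score every end position and
#      keep the top-K ends,
# and cross_entropy_over_beam combines the three (scores, selected ids) pairs
# with the gold (sentence, start, end) labels into one globally normalized
# cost.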
def GNR(config):
    """Build the topology of the Globally Normalized Reader and return its
    cost layer."""
# encoding question words
question_embeddings = build_pretrained_embedding(
"question", paddle.data_type.integer_value_sequence, config.vocab_size,
config.embedding_dim, config.embedding_droprate)
question_vector, question_lstm_outs = encode_question(
input_embedding=question_embeddings, config=config, prefix="__ques")
# encoding document words
document_embeddings = build_pretrained_embedding(
"documents", paddle.data_type.integer_value_sub_sequence,
config.vocab_size, config.embedding_dim, config.embedding_droprate)
same_as_question = paddle.layer.data(
name="same_as_question",
type=paddle.data_type.integer_value_sub_sequence(2))
    document_words_encoding = encode_documents(
input_embedding=document_embeddings,
question_vector=question_vector,
question_lstm_outs=question_lstm_outs,
same_as_question=same_as_question,
config=config,
prefix="__doc")
doc_lstm_outs = basic_modules.stacked_bi_lstm_by_nested_seq(
        input_layer=document_words_encoding,
hidden_dim=config.lstm_hidden_dim,
depth=config.lstm_depth,
prefix="__doc_lstm")
# define labels
sentence_idx = paddle.layer.data(
name="sen_idx", type=paddle.data_type.integer_value(1))
start_idx = paddle.layer.data(
name="start_idx", type=paddle.data_type.integer_value(1))
end_idx = paddle.layer.data(
name="end_idx", type=paddle.data_type.integer_value(1))
return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
config)
if __name__ == "__main__":
print(parse_network(GNR(ModelConfig)))
#!/usr/bin/env python
#coding=utf-8
import pdb
import os
import random
import json
def train_reader(data_list, is_train=True):
    """Return a reader over the featurized JSON samples listed in data_list."""

    def reader():
        # re-shuffle the sample list at the beginning of every pass
if is_train:
random.shuffle(data_list)
for train_sample in data_list:
data = json.load(open(train_sample, "r"))
sent_len = data['sent_lengths']
doc_len = len(data['context'])
same_as_question_word = [[[x]]
for x in data['same_as_question_word']]
ans_sentence = [0] * doc_len
ans_sentence[data['ans_sentence']] = 1
ans_start = [0] * doc_len
ans_start[data['ans_start']] = 1
ans_end = [0] * doc_len
ans_end[data['ans_end']] = 1
yield (data['question'], data['context'], same_as_question_word,
ans_sentence, ans_start, ans_end)
return reader
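# Each sample file read above is expected to be a JSON object containing at
# least the fields referenced in the code: "question", "context",
# "sent_lengths", "same_as_question_word", "ans_sentence", "ans_start" and
# "ans_end". Their exact layout is defined by the featurization step; the
# reader only re-packages them into the tuple order expected by the network.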
if __name__ == "__main__":
from train import choose_samples
train_list, dev_list = choose_samples("data/featurized")
for i, item in enumerate(train_reader(train_list)()):
print(item)
if i > 5: break
#!/usr/bin/env python
#coding=utf-8
from __future__ import print_function
import pdb
import os
import sys
import logging
import random
import glob
import gzip
import reader
import paddle.v2 as paddle
from paddle.v2.layer import parse_network
from model import GNR
from config import ModelConfig, TrainerConfig
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def load_pretrained_parameters(path, height, width):
    # TODO: load the pretrained (height x width) embedding matrix from `path`
    # and return it as a numpy ndarray for parameters.set(); see sketch below.
    return
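# A possible implementation sketch, assuming the pretrained vectors are stored
# as a plain-text matrix with `height` rows of `width` whitespace-separated
# floats (adapt to the actual file format):
#
#   import numpy as np
#
#   def load_pretrained_parameters(path, height, width):
#       mat = np.loadtxt(path, dtype="float32")
#       assert mat.shape == (height, width)
#       return mat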
def save_model(save_path, parameters):
with gzip.open(save_path, "w") as f:
parameters.to_tar(f)
def load_initial_model(model_path, parameters):
with gzip.open(model_path, "rb") as f:
parameters.init_from_tar(f)
def choose_samples(path):
"""
Load filenames for train, dev, and augmented samples.
"""
if not os.path.exists(os.path.join(path, "train")):
print(
"Non-existent directory as input path: {}".format(path),
file=sys.stderr)
sys.exit(1)
# Get paths to all samples that we want to load.
train_samples = glob.glob(os.path.join(path, "train", "*"))
valid_samples = glob.glob(os.path.join(path, "dev", "*"))
train_samples.sort()
valid_samples.sort()
random.shuffle(train_samples)
return train_samples, valid_samples
def build_reader(data_dir, batch_size):
"""
Build the data reader for this model.
"""
train_samples, valid_samples = choose_samples(data_dir)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(train_samples), buf_size=102400),
        batch_size=batch_size)
# testing data is not shuffled
test_reader = paddle.batch(
reader.train_reader(valid_samples, is_train=False),
        batch_size=batch_size)
return train_reader, test_reader
def build_event_handler(config, parameters, trainer, test_reader):
"""
Build the event handler for this model.
"""
# End batch and end pass event handler
def event_handler(event):
"""The event handler."""
if isinstance(event, paddle.event.EndIteration):
if (not event.batch_id % 100) and event.batch_id:
save_model("checkpoint_param.latest.tar.gz", parameters)
if not event.batch_id % 5:
logger.info(
"Pass %d, Batch %d, Cost %f, %s" %
(event.pass_id, event.batch_id, event.cost, event.metrics))
if isinstance(event, paddle.event.EndPass):
save_model(config.param_save_filename_format % event.pass_id,
parameters)
result = trainer.test(reader=test_reader)
logger.info("Test with Pass %d, %s" %
(event.pass_id, result.metrics))
return event_handler
def train(model_config, trainer_config):
paddle.init(use_gpu=True, trainer_count=1)
# define the optimizer
optimizer = paddle.optimizer.Adam(
learning_rate=trainer_config.learning_rate,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# define network topology
losses = GNR(model_config)
parameters = paddle.parameters.create(losses)
# print(parse_network(losses))
trainer = paddle.trainer.SGD(
cost=losses, parameters=parameters, update_equation=optimizer)
"""
parameters.set('GloveVectors',
load_pretrained_parameters(parameter_path, height, width))
"""
# define data reader
    train_reader, test_reader = build_reader(trainer_config.data_dir,
                                             trainer_config.batch_size)
    event_handler = build_event_handler(trainer_config, parameters, trainer,
                                        test_reader)
trainer.train(
reader=train_reader,
        num_passes=trainer_config.epochs,
event_handler=event_handler)
if __name__ == "__main__":
train(ModelConfig, TrainerConfig)