From d30a28c76e0c5679cabc6c30ea7fa20065cd16cb Mon Sep 17 00:00:00 2001
From: caoying03
Date: Tue, 25 Jul 2017 09:05:03 +0800
Subject: [PATCH] proj init.

---
 globally_normalized_reader/.gitignore       |   3 +
 globally_normalized_reader/basic_modules.py | 135 +++++++++++++
 globally_normalized_reader/config.py        |  25 +++
 globally_normalized_reader/model.py         | 199 ++++++++++++++++++++
 globally_normalized_reader/reader.py        |  43 +++++
 globally_normalized_reader/train.py         | 139 ++++++++++++++
 6 files changed, 544 insertions(+)
 create mode 100644 globally_normalized_reader/.gitignore
 create mode 100755 globally_normalized_reader/basic_modules.py
 create mode 100644 globally_normalized_reader/config.py
 create mode 100755 globally_normalized_reader/model.py
 create mode 100755 globally_normalized_reader/reader.py
 create mode 100755 globally_normalized_reader/train.py

diff --git a/globally_normalized_reader/.gitignore b/globally_normalized_reader/.gitignore
new file mode 100644
index 00000000..c345f460
--- /dev/null
+++ b/globally_normalized_reader/.gitignore
@@ -0,0 +1,3 @@
+data
+*.txt
+*.pyc
diff --git a/globally_normalized_reader/basic_modules.py b/globally_normalized_reader/basic_modules.py
new file mode 100755
index 00000000..99e1d905
--- /dev/null
+++ b/globally_normalized_reader/basic_modules.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+#coding=utf-8
+import collections
+
+import paddle.v2 as paddle
+from paddle.v2.layer import parse_network
+
+__all__ = [
+    "stacked_bidirectional_lstm",
+    "lstm_by_nested_sequence",
+]
+
+
+def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
+    if not isinstance(inputs, collections.Sequence):
+        inputs = [inputs]
+
+    lstm_last = []
+    for dirt in ["fwd", "bwd"]:
+        for i in range(depth):
+            input_proj = paddle.layer.mixed(
+                name="%s_in_proj_%0d_%s__" % (prefix, i, dirt),
+                size=size * 4,
+                bias_attr=paddle.attr.Param(initial_std=0.),
+                input=[paddle.layer.full_matrix_projection(lstm)] if i else [
+                    paddle.layer.full_matrix_projection(in_layer)
+                    for in_layer in inputs
+                ])
+            lstm = paddle.layer.lstmemory(
+                input=input_proj,
+                bias_attr=paddle.attr.Param(initial_std=0.),
+                param_attr=paddle.attr.Param(initial_std=5e-4),
+                reverse=(dirt == "bwd"))
+        lstm_last.append(lstm)
+
+    final_states = paddle.layer.concat(input=[
+        paddle.layer.last_seq(input=lstm_last[0]),
+        paddle.layer.first_seq(input=lstm_last[1]),
+    ])
+    return final_states, paddle.layer.concat(
+        input=lstm_last,
+        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate), )
+
+
+def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
+    '''
+    This is an LSTM implemented with a nested recurrent_group.
+    A paragraph is naturally a nested sequence:
+        1. each paragraph is a sequence of sentences;
+        2. each sentence is a sequence of words.
+
+    This function uses the nested recurrent_group to implement an LSTM:
+        1. The outer group iterates over the sentences in a paragraph.
+        2. The inner group iterates over the words in a sentence.
+        3. An LSTM encodes each sentence, and its final output is used to
+           initialize the memory of the LSTM that encodes the next sentence.
+        4. Parameters are shared among these sentence-encoding LSTMs.
+        5. Consequently, this function is equivalent to concatenating all
+           sentences in a paragraph into one (long) sentence and encoding
+           it with a single LSTM.
+    '''
+
+    def lstm_outer_step(lstm_group_input, hidden_dim, reverse, name=''):
+        outer_memory = paddle.layer.memory(
+            name="__inner_%s_last__" % name, size=hidden_dim)
+
+        def lstm_inner_step(input_layer, hidden_dim, reverse, name):
+            inner_memory = paddle.layer.memory(
+                name="__inner_state_%s__" % name,
+                size=hidden_dim,
+                boot_layer=outer_memory)
+            input_proj = paddle.layer.fc(
+                size=hidden_dim * 4, bias_attr=False, input=input_layer)
+            return paddle.networks.lstmemory_unit(
+                input=input_proj,
+                name="__inner_state_%s__" % name,
+                out_memory=inner_memory,
+                size=hidden_dim,
+                act=paddle.activation.Tanh(),
+                gate_act=paddle.activation.Sigmoid(),
+                state_act=paddle.activation.Tanh())
+
+        inner_out = paddle.layer.recurrent_group(
+            name="__inner_%s__" % name,
+            step=lstm_inner_step,
+            reverse=reverse,
+            input=[lstm_group_input, hidden_dim, reverse, name])
+
+        if reverse:
+            inner_last_output = paddle.layer.first_seq(
+                input=inner_out,
+                name="__inner_%s_last__" % name,
+                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+        else:
+            inner_last_output = paddle.layer.last_seq(
+                input=inner_out,
+                name="__inner_%s_last__" % name,
+                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+        return inner_out
+
+    return paddle.layer.recurrent_group(
+        input=[
+            paddle.layer.SubsequenceInput(input_layer), hidden_dim, reverse,
+            name
+        ],
+        step=lstm_outer_step,
+        name="__outer_%s__" % name,
+        reverse=reverse)
+
+
+def stacked_bi_lstm_by_nested_seq(input_layer, depth, hidden_dim, prefix=""):
+    lstm_final_outs = []
+    for dirt in ["fwd", "bwd"]:
+        for i in range(depth):
+            lstm_out = lstm_by_nested_sequence(
+                input_layer=(lstm_out if i else input_layer),
+                hidden_dim=hidden_dim,
+                name="__%s_%s_%02d__" % (prefix, dirt, i),
+                reverse=(dirt == "bwd"))
+        lstm_final_outs.append(lstm_out)
+    return paddle.layer.concat(input=lstm_final_outs)
+
+
+if __name__ == "__main__":
+    vocab_size = 1024
+    emb_dim = 128
+    embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name="word",
+            type=paddle.data_type.integer_value_sub_sequence(vocab_size)),
+        size=emb_dim)
+    print(parse_network(
+        stacked_bi_lstm_by_nested_seq(
+            input_layer=embedding, depth=3, hidden_dim=128, prefix="__lstm")))
diff --git a/globally_normalized_reader/config.py b/globally_normalized_reader/config.py
new file mode 100644
index 00000000..9b0ec4f0
--- /dev/null
+++ b/globally_normalized_reader/config.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+#coding=utf-8
+
+__all__ = ["ModelConfig", "TrainerConfig"]
+
+
+class ModelConfig(object):
+    beam_size = 5
+    vocab_size = 102400
+    embedding_dim = 256
+    embedding_droprate = 0.3
+
+    lstm_depth = 3
+    lstm_hidden_dim = 300
+    lstm_hidden_droprate = 0.3
+
+    passage_indep_embedding_dim = 300
+    passage_aligned_embedding_dim = 128
+
+
+class TrainerConfig(object):
+    learning_rate = 1e-3
+    data_dir = "data/featurized"
+    # The fields below are referenced by train.py; the values here are
+    # placeholders and should be tuned for the actual run.
+    batch_size = 10
+    epochs = 20
+    param_save_filename_format = "params_pass_%05d.tar.gz"
diff --git a/globally_normalized_reader/model.py b/globally_normalized_reader/model.py
new file mode 100755
index 00000000..21e86b65
--- /dev/null
+++ b/globally_normalized_reader/model.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+#coding=utf-8
+
+import paddle.v2 as paddle
+from paddle.v2.layer import parse_network
+import basic_modules
+from config import ModelConfig
+
+__all__ = ["GNR"]
+
+
+def build_pretrained_embedding(name,
+                               data_type,
+                               vocab_size,
+                               emb_dim,
+                               emb_drop=0.):
+    # use the data type passed by the caller, so that both plain sequences
+    # (the question) and nested sequences (the documents) are supported
+    one_hot_input = paddle.layer.data(name=name, type=data_type(vocab_size))
+    return paddle.layer.embedding(
+        input=one_hot_input,
+        size=emb_dim,
+        param_attr=paddle.attr.Param(
+            name="GloveVectors", is_static=True),
+        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=emb_drop), )
+
+
+def encode_question(input_embedding, config, prefix):
+    lstm_final, lstm_outs = basic_modules.stacked_bidirectional_lstm(
+        inputs=input_embedding,
+        size=config.lstm_hidden_dim,
+        depth=config.lstm_depth,
+        drop_rate=config.lstm_hidden_droprate,
+        prefix=prefix)
+
+    # passage-independent embeddings
+    candidates = paddle.layer.fc(input=lstm_outs,
+                                 bias_attr=False,
+                                 size=config.passage_indep_embedding_dim,
+                                 act=paddle.activation.Linear())
+    weights = paddle.layer.fc(input=lstm_outs,
+                              size=1,
+                              act=paddle.activation.SequenceSoftmax())
+    weighted_candidates = paddle.layer.scaling(
+        input=candidates, weight=weights)
+    passage_indep_embedding = paddle.layer.pooling(
+        input=weighted_candidates, pooling_type=paddle.pooling.Sum())
+    return paddle.layer.concat(
+        input=[lstm_final, passage_indep_embedding]), lstm_outs
+
+
+def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
+                                       config):
+    def outer_sentence_step(document_embeddings, question_lstm_outs, config):
+        '''
+        In this recurrent_group, document_embeddings has been scattered into
+        a sequence of sentence embeddings.
+        '''
+
+        def inner_word_step(word_embedding, question_lstm_outs,
+                            question_outs_proj, config):
+            '''
+            In this recurrent_group, the sentence embedding has been scattered
+            into word embeddings.
+            '''
+            doc_word_expand = paddle.layer.expand(
+                input=word_embedding,
+                expand_as=question_lstm_outs,
+                expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
+
+            weights = paddle.layer.fc(
+                input=[question_lstm_outs, doc_word_expand],
+                size=1,
+                act=paddle.activation.SequenceSoftmax())
+            weighted_candidates = paddle.layer.scaling(
+                input=question_outs_proj, weight=weights)
+            return paddle.layer.pooling(
+                input=weighted_candidates, pooling_type=paddle.pooling.Sum())
+
+        question_outs_proj = paddle.layer.fc(
+            input=question_lstm_outs,
+            bias_attr=False,
+            size=config.passage_aligned_embedding_dim)
+        return paddle.layer.recurrent_group(
+            input=[
+                paddle.layer.SubsequenceInput(document_embeddings),
+                paddle.layer.StaticInput(question_lstm_outs),
+                paddle.layer.StaticInput(question_outs_proj),
+                config,
+            ],
+            step=inner_word_step,
+            name="iter_over_word")
+
+    return paddle.layer.recurrent_group(
+        input=[
+            paddle.layer.SubsequenceInput(document_embeddings),
+            paddle.layer.StaticInput(question_lstm_outs), config
+        ],
+        step=outer_sentence_step,
+        name="iter_over_sen")
+
+
+def encode_documents(input_embedding, same_as_question, question_vector,
+                     question_lstm_outs, config, prefix):
+    question_expanded = paddle.layer.expand(
+        input=question_vector,
+        expand_as=input_embedding,
+        expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
+    question_aligned_embedding = question_aligned_passage_embedding(
+        question_lstm_outs, input_embedding, config)
+    return paddle.layer.concat(input=[
+        input_embedding, question_expanded, same_as_question,
+        question_aligned_embedding
+    ])
+
+
+def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config):
+    last_state_of_sentence = paddle.layer.last_seq(
+        input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+
+    # NOTE: do not use the sequence softmax activation here.
+    sentence_scores = paddle.layer.fc(input=last_state_of_sentence,
+                                      size=1,
+                                      act=paddle.activation.Exp())
+    topk_sentence_ids = paddle.layer.kmax_sequence_score(
+        input=sentence_scores, beam_size=config.beam_size)
+    topk_sen = paddle.layer.sub_nested_seq(
+        input=last_state_of_sentence, selected_indices=topk_sentence_ids)
+
+    # expand the beam to search for start positions in the selected sentences
+    start_pos_scores = paddle.layer.fc(input=topk_sen,
+                                       size=1,
+                                       act=paddle.activation.Exp())
+    topk_start_pos_ids = paddle.layer.kmax_sequence_score(
+        input=start_pos_scores, beam_size=config.beam_size)
+    topk_start_spans = paddle.layer.seq_slice(
+        input=topk_sen, starts=topk_start_pos_ids, ends=None)
+
+    # expand the beam to search for end positions in the selected start spans
+    _, end_span_embedding = basic_modules.stacked_bidirectional_lstm(
+        inputs=topk_start_spans,
+        size=config.lstm_hidden_dim,
+        depth=config.lstm_depth,
+        drop_rate=config.lstm_hidden_droprate,
+        prefix="__end_span_embeddings__")
+    end_pos_scores = paddle.layer.fc(input=end_span_embedding,
+                                     size=1,
+                                     act=paddle.activation.Exp())
+    topk_end_pos_ids = paddle.layer.kmax_sequence_score(
+        input=end_pos_scores, beam_size=config.beam_size)
+    cost = paddle.layer.cross_entropy_over_beam(
+        input=[
+            sentence_scores, topk_sentence_ids, start_pos_scores,
+            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
+        ],
+        label=[sentence_idx, start_idx, end_idx])
+    return cost
+
+
+def GNR(config):
+    # encode the question words
+    question_embeddings = build_pretrained_embedding(
+        "question", paddle.data_type.integer_value_sequence, config.vocab_size,
+        config.embedding_dim, config.embedding_droprate)
+    question_vector, question_lstm_outs = encode_question(
+        input_embedding=question_embeddings, config=config, prefix="__ques")
+
+    # encode the document words
+    document_embeddings = build_pretrained_embedding(
+        "documents", paddle.data_type.integer_value_sub_sequence,
+        config.vocab_size, config.embedding_dim, config.embedding_droprate)
+    same_as_question = paddle.layer.data(
+        name="same_as_question",
+        type=paddle.data_type.integer_value_sub_sequence(2))
+    document_words_encoding = encode_documents(
+        input_embedding=document_embeddings,
+        question_vector=question_vector,
+        question_lstm_outs=question_lstm_outs,
+        same_as_question=same_as_question,
+        config=config,
+        prefix="__doc")
+
+    doc_lstm_outs = basic_modules.stacked_bi_lstm_by_nested_seq(
+        input_layer=document_words_encoding,
+        hidden_dim=config.lstm_hidden_dim,
+        depth=config.lstm_depth,
+        prefix="__doc_lstm")
+
+    # define the labels
+    sentence_idx = paddle.layer.data(
+        name="sen_idx", type=paddle.data_type.integer_value(1))
+    start_idx = paddle.layer.data(
+        name="start_idx", type=paddle.data_type.integer_value(1))
+    end_idx = paddle.layer.data(
+        name="end_idx", type=paddle.data_type.integer_value(1))
+    return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
+                         config)
+
+
+if __name__ == "__main__":
+    print(parse_network(GNR(ModelConfig)))
diff --git a/globally_normalized_reader/reader.py b/globally_normalized_reader/reader.py
new file mode 100755
index 00000000..20e7128b
--- /dev/null
+++ b/globally_normalized_reader/reader.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+#coding=utf-8
+import random
+import json
+
+
+def train_reader(data_list, is_train=True):
+    def reader():
+        # re-shuffle the data list at the beginning of every pass
+        if is_train:
+            random.shuffle(data_list)
+
+        for train_sample in data_list:
+            data = json.load(open(train_sample, "r"))
+            sent_len = data['sent_lengths']
+
+            doc_len = len(data['context'])
+            same_as_question_word = [[[x]]
+                                     for x in data['same_as_question_word']]
+
+            ans_sentence = [0] * doc_len
+            ans_sentence[data['ans_sentence']] = 1
+
+            ans_start = [0] * doc_len
+            ans_start[data['ans_start']] = 1
+
+            ans_end = [0] * doc_len
+            ans_end[data['ans_end']] = 1
+            yield (data['question'], data['context'], same_as_question_word,
+                   ans_sentence, ans_start, ans_end)
+
+    return reader
+
+
+if __name__ == "__main__":
+    from train import choose_samples
+
+    train_list, dev_list = choose_samples("data/featurized")
+    for i, item in enumerate(train_reader(train_list)()):
+        print(item)
+        if i > 5: break
diff --git a/globally_normalized_reader/train.py b/globally_normalized_reader/train.py
new file mode 100755
index 00000000..8ef4bab1
--- /dev/null
+++ b/globally_normalized_reader/train.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+#coding=utf-8
+from __future__ import print_function
+
+import os
+import sys
+import logging
+import random
+import glob
+import gzip
+
+import reader
+import paddle.v2 as paddle
+from paddle.v2.layer import parse_network
+from model import GNR
+from config import ModelConfig, TrainerConfig
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def load_pretrained_parameters(path, height, width):
+    # TODO: load the pre-trained GloVe embedding matrix from `path`.
+    return
+
+
+def save_model(save_path, parameters):
+    with gzip.open(save_path, "w") as f:
+        parameters.to_tar(f)
+
+
+def load_initial_model(model_path, parameters):
+    with gzip.open(model_path, "rb") as f:
+        parameters.init_from_tar(f)
+
+
+def choose_samples(path):
+    """
+    Load filenames of the train and dev samples.
+    """
+    if not os.path.exists(os.path.join(path, "train")):
+        print(
+            "Non-existent directory as input path: {}".format(path),
+            file=sys.stderr)
+        sys.exit(1)
+
+    # Get paths to all samples that we want to load.
+    train_samples = glob.glob(os.path.join(path, "train", "*"))
+    valid_samples = glob.glob(os.path.join(path, "dev", "*"))
+
+    train_samples.sort()
+    valid_samples.sort()
+
+    random.shuffle(train_samples)
+
+    return train_samples, valid_samples
+
+
+def build_reader(config, data_dir):
+    """
+    Build the data readers for this model.
+    """
+    train_samples, valid_samples = choose_samples(data_dir)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader.train_reader(train_samples), buf_size=102400),
+        batch_size=config.batch_size)
+
+    # the testing data is not shuffled
+    test_reader = paddle.batch(
+        reader.train_reader(valid_samples, is_train=False),
+        batch_size=config.batch_size)
+    return train_reader, test_reader
+
+
+def build_event_handler(config, parameters, trainer, test_reader):
+    """
+    Build the event handler for this model.
+ """ + + # End batch and end pass event handler + def event_handler(event): + """The event handler.""" + if isinstance(event, paddle.event.EndIteration): + if (not event.batch_id % 100) and event.batch_id: + save_model("checkpoint_param.latest.tar.gz", parameters) + + if not event.batch_id % 5: + logger.info( + "Pass %d, Batch %d, Cost %f, %s" % + (event.pass_id, event.batch_id, event.cost, event.metrics)) + + if isinstance(event, paddle.event.EndPass): + save_model(config.param_save_filename_format % event.pass_id, + parameters) + with gzip.open(param_path, 'w') as handle: + parameters.to_tar(handle) + + result = trainer.test(reader=test_reader) + logger.info("Test with Pass %d, %s" % + (event.pass_id, result.metrics)) + + return event_handler + + +def train(model_config, trainer_config): + paddle.init(use_gpu=True, trainer_count=1) + + # define the optimizer + optimizer = paddle.optimizer.Adam( + learning_rate=trainer_config.learning_rate, + regularization=paddle.optimizer.L2Regularization(rate=1e-3), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + + # define network topology + losses = GNR(model_config) + parameters = paddle.parameters.create(losses) + # print(parse_network(losses)) + trainer = paddle.trainer.SGD( + cost=losses, parameters=parameters, update_equation=optimizer) + """ + parameters.set('GloveVectors', + load_pretrained_parameters(parameter_path, height, width)) + """ + + # define data reader + train_reader, test_reader = build_reader(trainer_config.data_dir) + + event_handler = build_event_handler(conf, parameters, trainer, test_reader) + trainer.train( + reader=train_reader, + num_passes=conf.epochs, + event_handler=event_handler) + + +if __name__ == "__main__": + train(ModelConfig, TrainerConfig) -- GitLab