diff --git a/globally_normalized_reader/basic_modules.py b/globally_normalized_reader/basic_modules.py
index 99e1d905f1790c3a868ae75c2a46b590e189cd52..08d5f5056a8d1179c45757d074e22aa3b9aec00f 100755
--- a/globally_normalized_reader/basic_modules.py
+++ b/globally_normalized_reader/basic_modules.py
@@ -38,9 +38,11 @@ def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
         paddle.layer.last_seq(input=lstm_last[0]),
         paddle.layer.first_seq(input=lstm_last[1]),
     ])
-    return final_states, paddle.layer.concat(
+
+    lstm_outs = paddle.layer.concat(
         input=lstm_last,
-        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate), )
+        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate))
+    return final_states, lstm_outs
 
 
 def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
@@ -70,8 +72,9 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
             name="__inner_state_%s__" % name,
             size=hidden_dim,
             boot_layer=outer_memory)
-        input_proj = paddle.layer.fc(
-            size=hidden_dim * 4, bias_attr=False, input=input_layer)
+        input_proj = paddle.layer.fc(size=hidden_dim * 4,
+                                     bias_attr=False,
+                                     input=input_layer)
         return paddle.networks.lstmemory_unit(
             input=input_proj,
             name="__inner_state_%s__" % name,
@@ -91,12 +94,12 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
             inner_last_output = paddle.layer.first_seq(
                 input=inner_out,
                 name="__inner_%s_last__" % name,
-                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
         else:
             inner_last_output = paddle.layer.last_seq(
                 input=inner_out,
                 name="__inner_%s_last__" % name,
-                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
+                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
         return inner_out
 
     return paddle.layer.recurrent_group(
diff --git a/globally_normalized_reader/config.py b/globally_normalized_reader/config.py
index 9b0ec4f0f5088b64ee6fdad3a3b22c860a966dea..94ce8ad8c511ee13d86b6097bfb756ccfc0c38ed 100644
--- a/globally_normalized_reader/config.py
+++ b/globally_normalized_reader/config.py
@@ -6,8 +6,8 @@ __all__ = ["ModelConfig"]
 
 class ModelConfig(object):
     beam_size = 3
-    vocab_size = 102400
-    embedding_dim = 256
+    vocab_size = 104808
+    embedding_dim = 300
 
     embedding_droprate = 0.3
     lstm_depth = 3
@@ -17,9 +17,17 @@
     passage_indep_embedding_dim = 300
     passage_aligned_embedding_dim = 128
 
-    beam_size = 5
+    beam_size = 32
+
+    dict_path = "data/featurized/vocab.txt"
+    pretrained_emb_path = "data/featurized/embeddings.npy"
 
 
 class TrainerConfig(object):
     learning_rate = 1e-3
 
     data_dir = "data/featurized"
+    save_dir = "models"
+
+    batch_size = 12 * 4
+
+    epochs = 100
diff --git a/globally_normalized_reader/model.py b/globally_normalized_reader/model.py
index 21e86b65301cd43b4aa69017f7536d1b257b34a0..9375cb55bbf60afc3c528567c2c5a5d4f3476e2b 100755
--- a/globally_normalized_reader/model.py
+++ b/globally_normalized_reader/model.py
@@ -10,15 +10,10 @@ from config import ModelConfig
 __all__ = ["GNR"]
 
 
-def build_pretrained_embedding(name,
-                               data_type,
-                               vocab_size,
-                               emb_dim,
-                               emb_drop=0.):
-    one_hot_input = paddle.layer.data(
-        name=name, type=paddle.data_type.integer_value_sequence(vocab_size))
+def build_pretrained_embedding(name, data_type, emb_dim, emb_drop=0.):
     return paddle.layer.embedding(
-        input=one_hot_input,
+        input=paddle.layer.data(
+            name=name, type=data_type),
         size=emb_dim,
         param_attr=paddle.attr.Param(
             name="GloveVectors", is_static=True),
@@ -112,25 +107,24 @@ def encode_documents(input_embedding, same_as_question, question_vector,
     ])
 
 
-def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config):
+def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
+                  is_infer):
     last_state_of_sentence = paddle.layer.last_seq(
         input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
-
-    # HERE do not use sequence softmax activition.
     sentence_scores = paddle.layer.fc(input=last_state_of_sentence,
                                       size=1,
-                                      act=paddle.activation.Exp())
+                                      act=paddle.activation.Linear())
     topk_sentence_ids = paddle.layer.kmax_sequence_score(
         input=sentence_scores, beam_size=config.beam_size)
     topk_sen = paddle.layer.sub_nested_seq(
-        input=last_state_of_sentence, selected_indices=topk_sentence_ids)
+        input=doc_lstm_outs, selected_indices=topk_sentence_ids)
 
     # expand beam to search start positions on selected sentences
     start_pos_scores = paddle.layer.fc(input=topk_sen,
                                        size=1,
-                                       act=paddle.activation.Exp())
+                                       act=paddle.activation.Linear())
     topk_start_pos_ids = paddle.layer.kmax_sequence_score(
-        input=sentence_scores, beam_size=config.beam_size)
+        input=start_pos_scores, beam_size=config.beam_size)
 
     topk_start_spans = paddle.layer.seq_slice(
         input=topk_sen, starts=topk_start_pos_ids, ends=None)
@@ -143,33 +137,40 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config):
         prefix="__end_span_embeddings__")
     end_pos_scores = paddle.layer.fc(input=end_span_embedding,
                                      size=1,
-                                     act=paddle.activation.Exp())
+                                     act=paddle.activation.Linear())
     topk_end_pos_ids = paddle.layer.kmax_sequence_score(
         input=end_pos_scores, beam_size=config.beam_size)
 
-    cost = paddle.layer.cross_entropy_over_beam(
-        input=[
-            sentence_scores, topk_sentence_ids, start_pos_scores,
-            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
-        ],
-        label=[sentence_idx, start_idx, end_idx])
-    return cost
+    if is_infer:
+        return [topk_sentence_ids, topk_start_pos_ids, topk_end_pos_ids]
+    else:
+        return paddle.layer.cross_entropy_over_beam(input=[
+            paddle.layer.BeamInput(sentence_scores, topk_sentence_ids,
+                                   sentence_idx),
+            paddle.layer.BeamInput(start_pos_scores, topk_start_pos_ids,
+                                   start_idx),
+            paddle.layer.BeamInput(end_pos_scores, topk_end_pos_ids, end_idx)
+        ])
 
 
-def GNR(config):
+
+def GNR(config, is_infer=False):
     # encoding question words
     question_embeddings = build_pretrained_embedding(
-        "question", paddle.data_type.integer_value_sequence, config.vocab_size,
+        "question",
+        paddle.data_type.integer_value_sequence(config.vocab_size),
         config.embedding_dim, config.embedding_droprate)
     question_vector, question_lstm_outs = encode_question(
         input_embedding=question_embeddings, config=config, prefix="__ques")
 
     # encoding document words
     document_embeddings = build_pretrained_embedding(
-        "documents", paddle.data_type.integer_value_sub_sequence,
-        config.vocab_size, config.embedding_dim, config.embedding_droprate)
+        "documents",
+        paddle.data_type.integer_value_sub_sequence(config.vocab_size),
+        config.embedding_dim, config.embedding_droprate)
     same_as_question = paddle.layer.data(
         name="same_as_question",
-        type=paddle.data_type.integer_value_sub_sequence(2))
+        type=paddle.data_type.dense_vector_sub_sequence(1))
+
     document_words_ecoding = encode_documents(
         input_embedding=document_embeddings,
         question_vector=question_vector,
@@ -192,7 +193,7 @@ def GNR(config):
     end_idx = paddle.layer.data(
         name="end_idx", type=paddle.data_type.integer_value(1))
     return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
-                         config)
+                         config, is_infer)
 
 
 if __name__ == "__main__":
diff --git a/globally_normalized_reader/reader.py b/globally_normalized_reader/reader.py
index 20e7128b176a1b1a4392f8364ace721abcef6cfa..bbfd414d5b688e204a7bc4932bf218346c64ac8e 100755
--- a/globally_normalized_reader/reader.py
+++ b/globally_normalized_reader/reader.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 #coding=utf-8
-import pdb
 import os
 import random
 import json
+import logging
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
 
 
 def train_reader(data_list, is_train=True):
@@ -14,22 +17,20 @@
 
         for train_sample in data_list:
             data = json.load(open(train_sample, "r"))
-            sent_len = data['sent_lengths']
-
-            doc_len = len(data['context'])
-            same_as_question_word = [[[x]]
-                                     for x in data['same_as_question_word']]
-
-            ans_sentence = [0] * doc_len
-            ans_sentence[data['ans_sentence']] = 1
-
-            ans_start = [0] * doc_len
-            ans_start[data['ans_start']] = 1
-            ans_end = [0] * doc_len
-            ans_end[data['ans_end']] = 1
-            yield (data['question'], data['context'], same_as_question_word,
-                   ans_sentence, ans_start, ans_end)
+            start_pos = 0
+            doc = []
+            same_as_question_word = []
+            for l in data['sent_lengths']:
+                doc.append(data['context'][start_pos:start_pos + l])
+                same_as_question_word.append([
+                    [[x]] for x in data['same_as_question_word']
+                ][start_pos:start_pos + l])
+                start_pos += l
+
+            yield (data['question'], doc, same_as_question_word,
+                   data['ans_sentence'], data['ans_start'],
+                   data['ans_end'] - data['ans_start'])
 
     return reader
diff --git a/globally_normalized_reader/train.py b/globally_normalized_reader/train.py
index 8ef4bab1047318e272e5bbb8292d67214592a9ee..bb99c27e36cd0c566bbd678bfd91f9676a8df28b 100755
--- a/globally_normalized_reader/train.py
+++ b/globally_normalized_reader/train.py
@@ -9,6 +9,7 @@ import logging
 import random
 import glob
 import gzip
+import numpy as np
 
 import reader
 import paddle.v2 as paddle
@@ -21,7 +22,7 @@ logger.setLevel(logging.INFO)
 
 
 def load_pretrained_parameters(path, height, width):
-    return
+    return np.load(path)
 
 
 def save_model(save_path, parameters):
@@ -51,27 +52,30 @@ def choose_samples(path):
 
     train_samples.sort()
     valid_samples.sort()
-    random.shuffle(train_samples)
+    # random.shuffle(train_samples)
 
     return train_samples, valid_samples
 
 
-def build_reader(data_dir):
+def build_reader(data_dir, batch_size):
     """
     Build the data reader for this model.
""" train_samples, valid_samples = choose_samples(data_dir) - pdb.set_trace() train_reader = paddle.batch( paddle.reader.shuffle( reader.train_reader(train_samples), buf_size=102400), - batch_size=config.batch_size) + batch_size=batch_size) + + # train_reader = paddle.batch( + # reader.train_reader(train_samples), batch_size=batch_size) # testing data is not shuffled test_reader = paddle.batch( - reader.train_reader(valid_samples, is_train=False), - batch_size=config.batch_size) + reader.train_reader( + valid_samples, is_train=False), + batch_size=batch_size) return train_reader, test_reader @@ -85,53 +89,65 @@ def build_event_handler(config, parameters, trainer, test_reader): """The event handler.""" if isinstance(event, paddle.event.EndIteration): if (not event.batch_id % 100) and event.batch_id: - save_model("checkpoint_param.latest.tar.gz", parameters) + save_path = os.path.join(config.save_dir, + "checkpoint_param.latest.tar.gz") + save_model(save_path, parameters) - if not event.batch_id % 5: + if not event.batch_id % 1: logger.info( "Pass %d, Batch %d, Cost %f, %s" % (event.pass_id, event.batch_id, event.cost, event.metrics)) if isinstance(event, paddle.event.EndPass): - save_model(config.param_save_filename_format % event.pass_id, - parameters) - with gzip.open(param_path, 'w') as handle: - parameters.to_tar(handle) + save_path = os.path.join(config.save_dir, + "pass_%05d.tar.gz" % event.pass_id) + save_model(save_path, parameters) - result = trainer.test(reader=test_reader) - logger.info("Test with Pass %d, %s" % - (event.pass_id, result.metrics)) + # result = trainer.test(reader=test_reader) + # logger.info("Test with Pass %d, %s" % + # (event.pass_id, result.metrics)) return event_handler def train(model_config, trainer_config): - paddle.init(use_gpu=True, trainer_count=1) + if not os.path.exists(trainer_config.save_dir): + os.mkdir(trainer_config.save_dir) + + paddle.init(use_gpu=True, trainer_count=4) # define the optimizer optimizer = paddle.optimizer.Adam( learning_rate=trainer_config.learning_rate, regularization=paddle.optimizer.L2Regularization(rate=1e-3), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + # model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + ) # define network topology - losses = GNR(model_config) - parameters = paddle.parameters.create(losses) - # print(parse_network(losses)) - trainer = paddle.trainer.SGD( - cost=losses, parameters=parameters, update_equation=optimizer) - """ - parameters.set('GloveVectors', - load_pretrained_parameters(parameter_path, height, width)) - """ + loss = GNR(model_config) + + # print(parse_network(loss)) + + parameters = paddle.parameters.create(loss) + parameters.set("GloveVectors", + load_pretrained_parameters( + ModelConfig.pretrained_emb_path, + height=ModelConfig.vocab_size, + width=ModelConfig.embedding_dim)) + + trainer = paddle.trainer.SGD(cost=loss, + parameters=parameters, + update_equation=optimizer) # define data reader - train_reader, test_reader = build_reader(trainer_config.data_dir) + train_reader, test_reader = build_reader(trainer_config.data_dir, + trainer_config.batch_size) - event_handler = build_event_handler(conf, parameters, trainer, test_reader) + event_handler = build_event_handler(trainer_config, parameters, trainer, + test_reader) trainer.train( - reader=train_reader, - num_passes=conf.epochs, + reader=data_reader, + num_passes=trainer_config.epochs, event_handler=event_handler)