diff --git a/globally_normalized_reader/README.md b/globally_normalized_reader/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a0990367ef8b03c70c29d285e22ef85907e1d0b7
--- /dev/null
+++ b/globally_normalized_reader/README.md
@@ -0,0 +1 @@
+TBD
diff --git a/globally_normalized_reader/config.py b/globally_normalized_reader/config.py
index 94ce8ad8c511ee13d86b6097bfb756ccfc0c38ed..5d31eafc2cac879b74d1f69d9dc7855614fd8282 100644
--- a/globally_normalized_reader/config.py
+++ b/globally_normalized_reader/config.py
@@ -5,7 +5,6 @@ __all__ = ["ModelConfig"]
 
 
 class ModelConfig(object):
-    beam_size = 3
     vocab_size = 104808
     embedding_dim = 300
     embedding_droprate = 0.3
@@ -15,7 +14,7 @@
     lstm_hidden_droprate = 0.3
 
     passage_indep_embedding_dim = 300
-    passage_aligned_embedding_dim = 128
+    passage_aligned_embedding_dim = 300
 
     beam_size = 32
 
@@ -28,6 +27,16 @@ class TrainerConfig(object):
     data_dir = "data/featurized"
     save_dir = "models"
 
-    batch_size = 12 * 4
+    train_batch_size = 4 * 10
+    test_batch_size = 1
     epochs = 100
+
+    # debug print period; if set to 0, no parameter status will be printed.
+    show_parameter_status_period = 0
+    checkpoint_period = 100
+    log_period = 1
+
+    # This is used to resume training; it can be set to the path of a
+    # previously trained model.
+    init_model_path = None
 
diff --git a/globally_normalized_reader/infer.py b/globally_normalized_reader/infer.py
new file mode 100755
index 0000000000000000000000000000000000000000..23ad978d47353c55cfff136e70de28ae62989774
--- /dev/null
+++ b/globally_normalized_reader/infer.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+#coding=utf-8
+import os
+import sys
+import gzip
+import logging
+import numpy as np
+import pdb
+
+import paddle.v2 as paddle
+from paddle.v2.layer import parse_network
+import reader
+
+from model import GNR
+from train import choose_samples
+from config import ModelConfig, TrainerConfig
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def load_reverse_dict(dict_file):
+    word_dict = {}
+    with open(dict_file, "r") as fin:
+        for idx, line in enumerate(fin):
+            word_dict[idx] = line.strip()
+    return word_dict
+
+
+def parse_one_sample(raw_input_doc, sub_sen_scores, selected_sentence,
+                     start_span_scores, selected_starts, end_span_scores,
+                     selected_ends):
+    assert len(raw_input_doc) == sub_sen_scores.shape[0]
+    beam_size = selected_sentence.shape[1]
+
+    all_searched_ans = []
+    for i in xrange(selected_ends.shape[0]):
+        for j in xrange(selected_ends.shape[1]):
+            if selected_ends[i][j] == -1.: break
+            all_searched_ans.append({
+                'score': end_span_scores[int(selected_ends[i][j])],
+                'sentence_pos': -1,
+                'start_span_pos': -1,
+                'end_span_pos': int(selected_ends[i][j]),
+                'parent_ids_in_prev_beam': i
+            })
+
+    for path in all_searched_ans:
+        row_id = path['parent_ids_in_prev_beam'] / beam_size
+        col_id = path['parent_ids_in_prev_beam'] % beam_size
+        path['start_span_pos'] = int(selected_starts[row_id][col_id])
+        path['score'] += start_span_scores[path['start_span_pos']]
+        path['parent_ids_in_prev_beam'] = row_id
+
+    for path in all_searched_ans:
+        row_id = path['parent_ids_in_prev_beam'] / beam_size
+        col_id = path['parent_ids_in_prev_beam'] % beam_size
+        path['sentence_pos'] = int(selected_sentence[row_id][col_id])
+        path['score'] += sub_sen_scores[path['sentence_pos']]
+
+    all_searched_ans.sort(key=lambda x: x['score'], reverse=True)
+    return all_searched_ans
+
+
+def infer_a_batch(inferer, test_batch, ids_2_word, out_layer_count):
+    outs = inferer.infer(input=test_batch, flatten_result=False, field="value")
+
+    for test_sample in test_batch:
+        query_word = [ids_2_word[ids] for ids in test_sample[0]]
+        print("query\n\t%s\ndocument" % (" ".join(query_word)))
+
+        # iterate over each sentence in the document
+        for i, sentence in enumerate(test_sample[1]):
+            sen_word = [ids_2_word[ids] for ids in sentence]
+            print("%d\t%s" % (i, " ".join(sen_word)))
+        print("gold\t[%d %d %d]" %
+              (test_sample[3], test_sample[4], test_sample[5]))
+
+        ans = parse_one_sample(test_sample[1], *outs)[0]
+        ans_ids = test_sample[1][ans['sentence_pos']][ans['start_span_pos']:ans[
+            'start_span_pos'] + ans['end_span_pos']]
+        ans_str = " ".join([ids_2_word[ids] for ids in ans_ids])
+        print("searched answer\t[%d %d %d]\n\t%s" %
+              (ans['sentence_pos'], ans['start_span_pos'], ans['end_span_pos'],
+               ans_str))
+
+
+def infer(model_path, data_dir, test_batch_size, config):
+    assert os.path.exists(model_path), "The model does not exist."
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    ids_2_word = load_reverse_dict(config.dict_path)
+
+    outputs = GNR(config, is_infer=True)
+
+    # load the trained model
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open(model_path, "r"))
+    inferer = paddle.inference.Inference(
+        output_layer=outputs, parameters=parameters)
+
+    _, valid_samples = choose_samples(data_dir)
+    test_reader = reader.data_reader(valid_samples, is_train=False)
+
+    test_batch = []
+    for i, item in enumerate(test_reader()):
+        test_batch.append(item)
+        if len(test_batch) == test_batch_size:
+            infer_a_batch(inferer, test_batch, ids_2_word, len(outputs))
+            test_batch = []
+
+    if len(test_batch):
+        infer_a_batch(inferer, test_batch, ids_2_word, len(outputs))
+        test_batch = []
+
+
+if __name__ == "__main__":
+    infer("models/pass_00003.tar.gz", TrainerConfig.data_dir,
+          TrainerConfig.test_batch_size, ModelConfig)
diff --git a/globally_normalized_reader/model.py b/globally_normalized_reader/model.py
index 9375cb55bbf60afc3c528567c2c5a5d4f3476e2b..3a81c616e94c2e28a1e374be4a9b475f0a4e4b55 100755
--- a/globally_normalized_reader/model.py
+++ b/globally_normalized_reader/model.py
@@ -113,6 +113,7 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
         input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
     sentence_scores = paddle.layer.fc(input=last_state_of_sentence,
                                       size=1,
+                                      bias_attr=False,
                                       act=paddle.activation.Linear())
     topk_sentence_ids = paddle.layer.kmax_sequence_score(
         input=sentence_scores, beam_size=config.beam_size)
@@ -122,6 +123,7 @@
     # expand beam to search start positions on selected sentences
     start_pos_scores = paddle.layer.fc(input=topk_sen,
                                        size=1,
+                                       bias_attr=False,
                                        act=paddle.activation.Linear())
     topk_start_pos_ids = paddle.layer.kmax_sequence_score(
         input=start_pos_scores, beam_size=config.beam_size)
@@ -137,12 +139,16 @@
         prefix="__end_span_embeddings__")
     end_pos_scores = paddle.layer.fc(input=end_span_embedding,
                                      size=1,
+                                     bias_attr=False,
                                      act=paddle.activation.Linear())
     topk_end_pos_ids = paddle.layer.kmax_sequence_score(
         input=end_pos_scores, beam_size=config.beam_size)
 
     if is_infer:
-        return [topk_sentence_ids, topk_start_pos_ids, topk_end_pos_ids]
+        return [
+            sentence_scores, topk_sentence_ids, start_pos_scores,
+            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
+        ]
     else:
         return paddle.layer.cross_entropy_over_beam(input=[
             paddle.layer.BeamInput(sentence_scores, topk_sentence_ids,
diff --git a/globally_normalized_reader/reader.py b/globally_normalized_reader/reader.py
index bbfd414d5b688e204a7bc4932bf218346c64ac8e..e17f2a7c30c11d74078b630b06c0e50091661027 100755
--- a/globally_normalized_reader/reader.py
+++ b/globally_normalized_reader/reader.py
@@ -9,7 +9,7 @@ logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
 
 
-def train_reader(data_list, is_train=True):
+def data_reader(data_list, is_train=True):
     def reader():
         # every pass shuffle the data list again
         if is_train:
@@ -39,6 +39,6 @@ if __name__ == "__main__":
     from train import choose_samples
 
     train_list, dev_list = choose_samples("data/featurized")
-    for i, item in enumerate(train_reader(train_list)()):
+    for i, item in enumerate(data_reader(train_list)()):
         print(item)
         if i > 5: break
diff --git a/globally_normalized_reader/train.py b/globally_normalized_reader/train.py
index bb99c27e36cd0c566bbd678bfd91f9676a8df28b..c5d0af23146508661aa751f809e47a809d6cdc62 100755
--- a/globally_normalized_reader/train.py
+++ b/globally_normalized_reader/train.py
@@ -21,6 +21,14 @@ logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
 
 
+def load_initial_model(model_path, parameters):
+    """
+    Initialize parameters in the network from a trained model.
+    """
+    with gzip.open(model_path, "rb") as f:
+        parameters.init_from_tar(f)
+
+
 def load_pretrained_parameters(path, height, width):
     return np.load(path)
 
@@ -35,6 +43,38 @@ def load_initial_model(model_path, parameters):
         parameters.init_from_tar(f)
 
 
+def show_parameter_init_info(parameters):
+    for p in parameters:
+        logger.info("%s : initial_mean %.4f initial_std %.4f" %
+                    (p, parameters.__param_conf__[p].initial_mean,
+                     parameters.__param_conf__[p].initial_std))
+
+
+def dump_value_matrix(param_name, dims, value):
+    np.savetxt(
+        param_name + ".txt",
+        value.reshape(dims[0], dims[1]),
+        fmt="%.4f",
+        delimiter=",")
+
+
+def show_parameter_status(parameters):
+    # for debugging: print statistics of each parameter's value and gradient
+    for p in parameters:
+
+        value = parameters.get(p)
+        grad = parameters.get_grad(p)
+
+        avg_abs_value = np.average(np.abs(value))
+        avg_abs_grad = np.average(np.abs(grad))
+
+        logger.info(
+            ("%s avg_abs_value=%.6f avg_abs_grad=%.6f "
+             "min_value=%.6f max_value=%.6f min_grad=%.6f max_grad=%.6f") %
+            (p, avg_abs_value, avg_abs_grad, value.min(), value.max(),
+             grad.min(), grad.max()))
+
+
 def choose_samples(path):
     """
     Load filenames for train, dev, and augmented samples.
@@ -52,7 +92,7 @@ def choose_samples(path):
     train_samples.sort()
     valid_samples.sort()
 
-    # random.shuffle(train_samples)
+    random.shuffle(train_samples)
 
     return train_samples, valid_samples
 
@@ -65,15 +105,12 @@
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            reader.train_reader(train_samples), buf_size=102400),
+            reader.data_reader(train_samples), buf_size=102400),
         batch_size=batch_size)
 
-    # train_reader = paddle.batch(
-    #     reader.train_reader(train_samples), batch_size=batch_size)
-
     # testing data is not shuffled
     test_reader = paddle.batch(
-        reader.train_reader(
+        reader.data_reader(
             valid_samples, is_train=False),
         batch_size=batch_size)
     return train_reader, test_reader
@@ -87,16 +124,21 @@ def build_event_handler(config, parameters, trainer, test_reader):
     # End batch and end pass event handler
    def event_handler(event):
         """The event handler."""
+
         if isinstance(event, paddle.event.EndIteration):
-            if (not event.batch_id % 100) and event.batch_id:
+            if event.batch_id and \
+                    (not event.batch_id % config.checkpoint_period):
                 save_path = os.path.join(config.save_dir,
                                          "checkpoint_param.latest.tar.gz")
                 save_model(save_path, parameters)
 
-            if not event.batch_id % 1:
-                logger.info(
-                    "Pass %d, Batch %d, Cost %f, %s" %
-                    (event.pass_id, event.batch_id, event.cost, event.metrics))
+            if event.batch_id and not event.batch_id % config.log_period:
+                logger.info("Pass %d, Batch %d, Cost %f" %
+                            (event.pass_id, event.batch_id, event.cost))
+
+            if config.show_parameter_status_period and event.batch_id and \
+                    not (event.batch_id % config.show_parameter_status_period):
+                show_parameter_status(parameters)
 
         if isinstance(event, paddle.event.EndPass):
             save_path = os.path.join(config.save_dir,
@@ -119,34 +161,36 @@
     # define the optimizer
     optimizer = paddle.optimizer.Adam(
         learning_rate=trainer_config.learning_rate,
-        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
-        # model_average=paddle.optimizer.ModelAverage(average_window=0.5))
-    )
+        regularization=paddle.optimizer.L2Regularization(rate=5e-4),
+        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
 
     # define network topology
     loss = GNR(model_config)
 
-    # print(parse_network(loss))
-
     parameters = paddle.parameters.create(loss)
-    parameters.set("GloveVectors",
-                   load_pretrained_parameters(
-                       ModelConfig.pretrained_emb_path,
-                       height=ModelConfig.vocab_size,
-                       width=ModelConfig.embedding_dim))
+    show_parameter_init_info(parameters)
+
+    if trainer_config.init_model_path:
+        load_initial_model(trainer_config.init_model_path, parameters)
+    else:
+        # load the pre-trained embeddings
+        parameters.set("GloveVectors",
+                       load_pretrained_parameters(
+                           ModelConfig.pretrained_emb_path,
+                           height=ModelConfig.vocab_size,
+                           width=ModelConfig.embedding_dim))
 
     trainer = paddle.trainer.SGD(cost=loss,
                                  parameters=parameters,
                                  update_equation=optimizer)
 
-    # define data reader
     train_reader, test_reader = build_reader(trainer_config.data_dir,
-                                             trainer_config.batch_size)
+                                             trainer_config.train_batch_size)
 
     event_handler = build_event_handler(trainer_config, parameters, trainer,
                                         test_reader)
 
     trainer.train(
-        reader=data_reader,
+        reader=train_reader,
        num_passes=trainer_config.epochs,
        event_handler=event_handler)