Commit f0a11911 authored by C caoying03

add infer script.

Parent ee1262d5
......@@ -5,7 +5,6 @@ __all__ = ["ModelConfig"]
class ModelConfig(object):
beam_size = 3
vocab_size = 104808
embedding_dim = 300
embedding_droprate = 0.3
......@@ -15,7 +14,7 @@ class ModelConfig(object):
lstm_hidden_droprate = 0.3
passage_indep_embedding_dim = 300
passage_aligned_embedding_dim = 128
passage_aligned_embedding_dim = 300
beam_size = 32
......@@ -28,6 +27,16 @@ class TrainerConfig(object):
data_dir = "data/featurized"
save_dir = "models"
batch_size = 12 * 4
train_batch_size = 4 * 10
test_batch_size = 1
epochs = 100
# for debug printing; if set to 0, no parameter status information will be printed.
show_parameter_status_period = 0
checkpoint_period = 100
log_period = 1
# this is used to resume training; the path can be set to a previously
# trained model.
init_model_path = None
#!/usr/bin/env python
#coding=utf-8
import os
import sys
import gzip
import logging
import numpy as np
import pdb
import paddle.v2 as paddle
from paddle.v2.layer import parse_network
import reader
from model import GNR
from train import choose_samples
from config import ModelConfig, TrainerConfig
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def load_reverse_dict(dict_file):
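"""Build an id-to-word dictionary from the vocabulary file (one word per line)."""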
word_dict = {}
with open(dict_file, "r") as fin:
for idx, line in enumerate(fin):
word_dict[idx] = line.strip()
return word_dict
def parse_one_sample(raw_input_doc, sub_sen_scores, selected_sentence,
start_span_scores, selected_starts, end_span_scores,
selected_ends):
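"""
Back-trace the beam search results for one sample, from the selected answer
end positions through the selected start positions to the selected sentences,
accumulating the scores of the three stages. Candidate answers are returned
sorted by total score in descending order.
"""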
assert len(raw_input_doc) == sub_sen_scores.shape[0]
beam_size = selected_sentence.shape[1]
all_searched_ans = []
for i in xrange(selected_ends.shape[0]):
for j in xrange(selected_ends.shape[1]):
if selected_ends[i][j] == -1.: break
all_searched_ans.append({
'score': end_span_scores[int(selected_ends[i][j])],
'sentence_pos': -1,
'start_span_pos': -1,
'end_span_pos': int(selected_ends[i][j]),
'parent_ids_in_prev_beam': i
})
for path in all_searched_ans:
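# each row of selected_ends expands one flattened (row, column) entry of
# selected_starts: the integer quotient gives the parent row, the remainder
# gives the column within that row's beam.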
row_id = path['parent_ids_in_prev_beam'] / beam_size
col_id = path['parent_ids_in_prev_beam'] % beam_size
path['start_span_pos'] = int(selected_starts[row_id][col_id])
path['score'] += start_span_scores[path['start_span_pos']]
path['parent_ids_in_prev_beam'] = row_id
for path in all_searched_ans:
row_id = path['parent_ids_in_prev_beam'] / beam_size
col_id = path['parent_ids_in_prev_beam'] % beam_size
path['sentence_pos'] = int(selected_sentence[row_id][col_id])
path['score'] += sub_sen_scores[path['sentence_pos']]
all_searched_ans.sort(key=lambda x: x['score'], reverse=True)
return all_searched_ans
def infer_a_batch(inferer, test_batch, ids_2_word, out_layer_count):
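"""
Run inference on one batch, then print the query, the document sentences,
the gold answer span and the top-scoring decoded answer for every sample.
"""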
outs = inferer.infer(input=test_batch, flatten_result=False, field="value")
for test_sample in test_batch:
query_word = [ids_2_word[ids] for ids in test_sample[0]]
print("query\n\t%s\ndocument" % (" ".join(query_word)))
# iterate over each sentence in the document
for i, sentence in enumerate(test_sample[1]):
sen_word = [ids_2_word[ids] for ids in sentence]
print("%d\t%s" % (i, " ".join(sen_word)))
print("gold\t[%d %d %d]" %
(test_sample[3], test_sample[4], test_sample[5]))
ans = parse_one_sample(test_sample[1], *outs)[0]
ans_ids = test_sample[1][ans['sentence_pos']][ans['start_span_pos']:ans[
'start_span_pos'] + ans['end_span_pos']]
ans_str = " ".join([ids_2_word[ids] for ids in ans_ids])
print("searched answer\t[%d %d %d]\n\t%s" %
(ans['sentence_pos'], ans['start_span_pos'], ans['end_span_pos'],
ans_str))
def infer(model_path, data_dir, test_batch_size, config):
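"""
Load the trained parameters from model_path, build the inference network,
and decode answers for the validation samples under data_dir batch by batch.
"""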
assert os.path.exists(model_path), "The model does not exist."
paddle.init(use_gpu=False, trainer_count=1)
ids_2_word = load_reverse_dict(config.dict_path)
outputs = GNR(config, is_infer=True)
# load the trained model
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
inferer = paddle.inference.Inference(
output_layer=outputs, parameters=parameters)
_, valid_samples = choose_samples(data_dir)
test_reader = reader.data_reader(valid_samples, is_train=False)
test_batch = []
for i, item in enumerate(test_reader()):
test_batch.append(item)
if len(test_batch) == test_batch_size:
infer_a_batch(inferer, test_batch, ids_2_word, len(outputs))
test_batch = []
if len(test_batch):
infer_a_batch(inferer, test_batch, ids_2_word, len(outputs))
test_batch = []
if __name__ == "__main__":
infer("models/pass_00003.tar.gz", TrainerConfig.data_dir,
TrainerConfig.test_batch_size, ModelConfig)
......@@ -113,6 +113,7 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
sentence_scores = paddle.layer.fc(input=last_state_of_sentence,
size=1,
bias_attr=False,
act=paddle.activation.Linear())
topk_sentence_ids = paddle.layer.kmax_sequence_score(
input=sentence_scores, beam_size=config.beam_size)
......@@ -122,6 +123,7 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
# expand beam to search start positions on selected sentences
start_pos_scores = paddle.layer.fc(input=topk_sen,
size=1,
bias_attr=False,
act=paddle.activation.Linear())
topk_start_pos_ids = paddle.layer.kmax_sequence_score(
input=start_pos_scores, beam_size=config.beam_size)
......@@ -137,12 +139,16 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
prefix="__end_span_embeddings__")
end_pos_scores = paddle.layer.fc(input=end_span_embedding,
size=1,
bias_attr=False,
act=paddle.activation.Linear())
topk_end_pos_ids = paddle.layer.kmax_sequence_score(
input=end_pos_scores, beam_size=config.beam_size)
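# At inference time the raw scores and the top-k indices of all three beam
# stages are returned so that infer.py can decode the best answer span;
# during training they are fed to cross_entropy_over_beam instead.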
if is_infer:
return [topk_sentence_ids, topk_start_pos_ids, topk_end_pos_ids]
return [
sentence_scores, topk_sentence_ids, start_pos_scores,
topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
]
else:
return paddle.layer.cross_entropy_over_beam(input=[
paddle.layer.BeamInput(sentence_scores, topk_sentence_ids,
......
......@@ -9,7 +9,7 @@ logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def train_reader(data_list, is_train=True):
def data_reader(data_list, is_train=True):
def reader():
# shuffle the data list again at every pass
if is_train:
......@@ -39,6 +39,6 @@ if __name__ == "__main__":
from train import choose_samples
train_list, dev_list = choose_samples("data/featurized")
for i, item in enumerate(train_reader(train_list)()):
for i, item in enumerate(data_reader(train_list)()):
print(item)
if i > 5: break
......@@ -21,6 +21,14 @@ logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def load_initial_model(model_path, parameters):
"""
Initialize the network parameters from a trained model.
"""
with gzip.open(model_path, "rb") as f:
parameters.init_from_tar(f)
def load_pretrained_parameters(path, height, width):
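"""
Load pre-trained word embeddings stored as a NumPy array; height and width
presumably give the expected shape (vocab_size x embedding_dim) but are not
checked here.
"""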
return np.load(path)
......@@ -35,6 +43,38 @@ def load_initial_model(model_path, parameters):
parameters.init_from_tar(f)
def show_parameter_init_info(parameters):
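"""Log the configured initialization mean and std of every parameter."""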
for p in parameters:
logger.info("%s : initial_mean %.4f initial_std %.4f" %
(p, parameters.__param_conf__[p].initial_mean,
parameters.__param_conf__[p].initial_std))
def dump_value_matrix(param_name, dims, value):
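"""Dump a parameter value matrix to <param_name>.txt for offline inspection."""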
np.savetxt(
param_name + ".txt",
value.reshape(dims[0], dims[1]),
fmt="%.4f",
delimiter=",")
def show_parameter_status(parameters):
# for debugging: log value and gradient statistics for every parameter
for p in parameters:
value = parameters.get(p)
grad = parameters.get_grad(p)
avg_abs_value = np.average(np.abs(value))
avg_abs_grad = np.average(np.abs(grad))
logger.info(
("%s avg_abs_value=%.6f avg_abs_grad=%.6f "
"min_value=%.6f max_value=%.6f min_grad=%.6f max_grad=%.6f") %
(p, avg_abs_value, avg_abs_grad, value.min(), value.max(),
grad.min(), grad.max()))
def choose_samples(path):
"""
Load filenames for train, dev, and augmented samples.
......@@ -52,7 +92,7 @@ def choose_samples(path):
train_samples.sort()
valid_samples.sort()
# random.shuffle(train_samples)
random.shuffle(train_samples)
return train_samples, valid_samples
......@@ -65,15 +105,12 @@ def build_reader(data_dir, batch_size):
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(train_samples), buf_size=102400),
reader.data_reader(train_samples), buf_size=102400),
batch_size=batch_size)
# train_reader = paddle.batch(
# reader.train_reader(train_samples), batch_size=batch_size)
# testing data is not shuffled
test_reader = paddle.batch(
reader.train_reader(
reader.data_reader(
valid_samples, is_train=False),
batch_size=batch_size)
return train_reader, test_reader
......@@ -87,16 +124,21 @@ def build_event_handler(config, parameters, trainer, test_reader):
# End batch and end pass event handler
def event_handler(event):
"""The event handler."""
if isinstance(event, paddle.event.EndIteration):
if (not event.batch_id % 100) and event.batch_id:
if event.batch_id and \
(not event.batch_id % config.checkpoint_period):
save_path = os.path.join(config.save_dir,
"checkpoint_param.latest.tar.gz")
save_model(save_path, parameters)
if not event.batch_id % 1:
logger.info(
"Pass %d, Batch %d, Cost %f, %s" %
(event.pass_id, event.batch_id, event.cost, event.metrics))
if event.batch_id and not event.batch_id % config.log_period:
logger.info("Pass %d, Batch %d, Cost %f" %
(event.pass_id, event.batch_id, event.cost))
if config.show_parameter_status_period and event.batch_id and \
not (event.batch_id % config.show_parameter_status_period):
show_parameter_status(parameters)
if isinstance(event, paddle.event.EndPass):
save_path = os.path.join(config.save_dir,
......@@ -119,34 +161,36 @@ def train(model_config, trainer_config):
# define the optimizer
optimizer = paddle.optimizer.Adam(
learning_rate=trainer_config.learning_rate,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
# model_average=paddle.optimizer.ModelAverage(average_window=0.5))
)
regularization=paddle.optimizer.L2Regularization(rate=5e-4),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# define network topology
loss = GNR(model_config)
# print(parse_network(loss))
parameters = paddle.parameters.create(loss)
parameters.set("GloveVectors",
load_pretrained_parameters(
ModelConfig.pretrained_emb_path,
height=ModelConfig.vocab_size,
width=ModelConfig.embedding_dim))
show_parameter_init_info(parameters)
if trainer_config.init_model_path:
load_initial_model(trainer_config.init_model_path, parameters)
else:
# load the pre-trained embeddings
parameters.set("GloveVectors",
load_pretrained_parameters(
ModelConfig.pretrained_emb_path,
height=ModelConfig.vocab_size,
width=ModelConfig.embedding_dim))
trainer = paddle.trainer.SGD(cost=loss,
parameters=parameters,
update_equation=optimizer)
# define data reader
train_reader, test_reader = build_reader(trainer_config.data_dir,
trainer_config.batch_size)
trainer_config.train_batch_size)
event_handler = build_event_handler(trainer_config, parameters, trainer,
test_reader)
trainer.train(
reader=data_reader,
reader=train_reader,
num_passes=trainer_config.epochs,
event_handler=event_handler)
......