训练序列标注模型用于预测新数据,发现预测结果都是空
Created by: April1010
模型配置如下,基本和book中的07是一样的。不一样的只有1.数据特征(增加了一些其他特征),2.word_embedding没有加载预先的配置。
def train(data_reader, args):
    """Build the CRF sequence-labeling network and run SGD training.

    Args:
        data_reader: project data source; must provide get_dict_size(),
            get_train_reader(), get_test_reader() and data_name_feeding().
        args: parsed command-line options (trainer_count, hidden dims,
            learning rates, batch_size, num_passes, output_path, is_local, ...).

    Side effects:
        Saves parameters to <output_path>/params_pass_<N>.tar.gz after every
        pass and prints training/validation cost to stdout.
    """
    paddle.init(use_gpu=False, trainer_count=args.trainer_count)

    # --- network topology -------------------------------------------------
    feature_out = db_lstm(
        stag_dict_size=data_reader.get_dict_size('stag_dict'),
        predicate_dict_size=data_reader.get_dict_size('predicate_dict'),
        word_dict_size=data_reader.get_dict_size('word_dict'),
        label_dict_size=data_reader.get_dict_size('label_dict'),
        mix_hidden_lr=args.mix_hidden_lr,
        default_std=args.default_std,
        hidden_dim=args.hidden_dim,
        word_dim=args.word_dim,
        mark_dim=args.mark_dim,
        stag_dim=args.stag_dim,
        num_lstm_layers=args.num_lstm_layers)
    target = paddle.layer.data(
        name='target', type=d_type(data_reader.get_dict_size('label_dict')))

    # CRF training cost; the decoding layer shares the same 'crfw' parameter
    # and is attached as an extra layer so the chunk evaluator can run.
    crf_cost = paddle.layer.crf(
        size=data_reader.get_dict_size('label_dict'),
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(
            name='crfw',
            initial_std=args.default_std,
            learning_rate=args.mix_hidden_lr))
    crf_dec = paddle.layer.crf_decoding(
        size=data_reader.get_dict_size('label_dict'),
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(name='crfw'))
    evaluator.sum(input=crf_dec)

    # --- parameters and optimizer -----------------------------------------
    parameters = paddle.parameters.create(crf_cost)
    optimizer = paddle.optimizer.Momentum(
        momentum=0,
        learning_rate=2e-2,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=10000))

    # --- data readers ------------------------------------------------------
    is_local_flag = args.is_local > 0  # comparison already yields a bool
    train_batch_reader = paddle.batch(
        paddle.reader.shuffle(
            data_reader.get_train_reader(is_local_flag), buf_size=8192),
        batch_size=args.batch_size)
    test_batch_reader = paddle.batch(
        data_reader.get_test_reader(is_local_flag),
        batch_size=args.batch_size)
    feeding = data_reader.data_name_feeding()

    trainer = paddle.trainer.SGD(
        cost=crf_cost,
        parameters=parameters,
        update_equation=optimizer,
        extra_layers=crf_dec)

    # Per-pass bookkeeping kept in a closure-local dict instead of module
    # globals, so the handler cannot hit a NameError and does not leak state.
    state = {'start_time': 0.0, 'cost_sum': 0.0, 'cost_counter': 0}

    def event_handler(event):
        if isinstance(event, paddle.event.BeginPass):
            state['start_time'] = time.time()
            state['cost_sum'], state['cost_counter'] = 0.0, 0
        if isinstance(event, paddle.event.EndIteration):
            state['cost_sum'] += event.cost
            state['cost_counter'] += 1
            if event.batch_id % 500 == 0:
                # print(...) is valid in both Python 2 and 3 for a single arg.
                print("Pass %d, Batch %d, Cost %f" % (
                    event.pass_id, event.batch_id,
                    state['cost_sum'] / state['cost_counter']))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            # Snapshot parameters, then report validation cost for the pass.
            with gzip.open(
                    args.output_path + '/params_pass_%d.tar.gz' % event.pass_id,
                    'w') as f:
                parameters.to_tar(f)
            result = trainer.test(reader=test_batch_reader, feeding=feeding)
            print("\n------- PASS END ------- \n"
                  " Time: %d sec, Pass: %d, ValidationCost: %s" % (
                      time.time() - state['start_time'], event.pass_id,
                      result.cost))

    trainer.train(
        reader=train_batch_reader,
        event_handler=event_handler,
        num_passes=args.num_passes,
        feeding=feeding)
def db_lstm(stag_dict_size, predicate_dict_size, word_dict_size,
            label_dict_size, mix_hidden_lr, default_std, hidden_dim,
            word_dim, mark_dim, stag_dim, num_lstm_layers,
            mark_dict_size=2):
    """Stacked bidirectional LSTM feature extractor for SRL-style tagging.

    Seven sparse-id sequence inputs (word, two context words, predicate,
    two supertag features and the predicate mark) are embedded, mixed into
    a hidden layer, and passed through `num_lstm_layers` LSTM layers with
    alternating direction and direct (highway-style) edges. Returns the
    final mixed layer of size `label_dict_size` to feed a CRF.

    Args:
        mark_dict_size: vocabulary size of the mark feature. Defaults to 2,
            matching the binary predicate-indicator mark of the book-07 SRL
            demo — TODO confirm against the data reader.
    """
    # --- 7 input features -------------------------------------------------
    word = paddle.layer.data(name='word_data', type=d_type(word_dict_size))
    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_size))
    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_size))
    predicate = paddle.layer.data(
        name='predicate_data', type=d_type(predicate_dict_size))
    word_stag = paddle.layer.data(
        name='word_stag_data', type=d_type(stag_dict_size))
    predicate_stag = paddle.layer.data(
        name='predicate_stag_data', type=d_type(stag_dict_size))
    # BUG FIX: `mark_dict_size` was referenced here but never defined
    # anywhere (not a parameter, not a global in view) -> NameError when the
    # topology is built. It is now a keyword parameter with default 2.
    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_size))

    # --- parameter attributes ---------------------------------------------
    # BUG FIX / NOTE(review): the book-07 demo uses initial_std=0. for 'emb'
    # because a *pretrained* word embedding is loaded over it afterwards.
    # This setup does NOT load pretrained embeddings (per the issue report),
    # so zero init left every word vector at exactly 0 — a likely cause of
    # the reported "all predictions empty" symptom. Initialize randomly.
    emb_para = paddle.attr.Param(name='emb', initial_std=default_std)
    std_0 = paddle.attr.Param(initial_std=0.)
    std_default = paddle.attr.Param(initial_std=default_std)

    # --- embeddings --------------------------------------------------------
    predicate_embedding = paddle.layer.embedding(
        size=word_dim,
        input=predicate,
        param_attr=paddle.attr.Param(name='vemb', initial_std=default_std))
    mark_embedding = paddle.layer.embedding(
        size=mark_dim, input=mark, param_attr=std_0)
    word_stag_embedding = paddle.layer.embedding(
        size=stag_dim, input=word_stag, param_attr=std_0)
    predicate_stag_embedding = paddle.layer.embedding(
        size=stag_dim, input=predicate_stag, param_attr=std_0)

    # Word and context words share the single 'emb' lookup table.
    word_input = [word, ctx_n1, ctx_p1]
    emb_layers = [
        paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
        for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(word_stag_embedding)
    emb_layers.append(predicate_stag_embedding)
    emb_layers.append(mark_embedding)

    # --- first hidden + LSTM layer -----------------------------------------
    hidden_0 = paddle.layer.mixed(
        size=hidden_dim,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=emb, param_attr=std_default) for emb in emb_layers
        ])

    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
    hidden_para_attr = paddle.attr.Param(
        initial_std=default_std, learning_rate=mix_hidden_lr)

    lstm_0 = paddle.layer.lstmemory(
        input=hidden_0,
        act=paddle.activation.Relu(),
        gate_act=paddle.activation.Sigmoid(),
        state_act=paddle.activation.Sigmoid(),
        bias_attr=std_0,
        param_attr=lstm_para_attr)

    # --- stack L-LSTM and R-LSTM with direct edges -------------------------
    input_tmp = [hidden_0, lstm_0]
    for i in range(1, num_lstm_layers):
        mix_hidden = paddle.layer.mixed(
            size=hidden_dim,
            bias_attr=std_default,
            input=[
                paddle.layer.full_matrix_projection(
                    input=input_tmp[0], param_attr=hidden_para_attr),
                paddle.layer.full_matrix_projection(
                    input=input_tmp[1], param_attr=lstm_para_attr)
            ])
        lstm = paddle.layer.lstmemory(
            input=mix_hidden,
            act=paddle.activation.Relu(),
            gate_act=paddle.activation.Sigmoid(),
            state_act=paddle.activation.Sigmoid(),
            reverse=((i % 2) == 1),  # alternate direction per layer
            bias_attr=std_0,
            param_attr=lstm_para_attr)
        input_tmp = [mix_hidden, lstm]

    # --- output projection (fed to the CRF by the caller) ------------------
    feature_out = paddle.layer.mixed(
        size=label_dict_size,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=input_tmp[0], param_attr=hidden_para_attr),
            paddle.layer.full_matrix_projection(
                input=input_tmp[1], param_attr=lstm_para_attr)
        ])
    return feature_out