提交 4c990dc2 编写于 作者: C caoying03

Add a trained seq2seq model to generate Chinese poetry.

上级 e72c1056
#!/usr/bin/env python
#coding=utf-8
import os
import sys
import gzip
import logging
import numpy as np
import reader
import paddle.v2 as paddle
from paddle.v2.layer import parse_network
from network_conf import encoder_decoder_network
logger = logging.getLogger("paddle")
logger.setLevel(logging.WARNING)
def infer_a_batch(inferer, test_batch, beam_size, id_to_text, fout):
    """Decode one batch with beam search and write the results to ``fout``.

    For each source sentence, the sentence itself is printed first (with
    the start/end marks stripped), followed by ``beam_size`` generated
    candidates in the form ``<probability>\\t<tokens>``, then a blank
    separator line.

    :param inferer: paddle Inference object exposing ``infer()``.
    :param test_batch: list of samples; each sample is ``[word_id_list]``.
    :param beam_size: number of candidates generated per source sentence.
    :param id_to_text: dict mapping a word id to its text.
    :param fout: writable file-like object receiving the results.
    """
    beam_result = inferer.infer(input=test_batch, field=["prob", "id"])

    # Each generated candidate ends with a -1 marker in the id array, so
    # there must be exactly beam_size markers per source sentence.
    gen_sen_idx = np.where(beam_result[1] == -1)[0]
    assert len(gen_sen_idx) == len(test_batch) * beam_size, ("%d vs. %d" % (
        len(gen_sen_idx), len(test_batch) * beam_size))

    # index 0 holds the start mark of the first candidate; real ids begin at 1
    start_pos, end_pos = 1, 0
    for i, sample in enumerate(test_batch):
        # skip the start and ending mark when printing the source sentence
        fout.write("%s\n" %
                   " ".join(id_to_text[w] for w in sample[0][1:-1]))
        for j in range(beam_size):  # range (not xrange): Python 3 compatible
            end_pos = gen_sen_idx[i * beam_size + j]
            fout.write("%.4f\t%s\n" % (beam_result[0][i][j], " ".join(
                id_to_text[w] for w in beam_result[1][start_pos:end_pos])))
            # +2 skips the -1 end marker and the next candidate's start mark
            start_pos = end_pos + 2
        fout.write("\n")
    fout.flush()  # BUG FIX: the original `fout.flush` never called the method
def generate(model_path, word_dict_path, test_data_path, batch_size, beam_size,
             save_file, use_gpu):
    """Generate poetry candidates for every sentence in ``test_data_path``.

    Loads the trained parameters from ``model_path``, builds the beam-search
    version of the network, and writes the decoded candidates to
    ``save_file`` (one batch at a time via ``infer_a_batch``).
    """
    assert os.path.exists(model_path), "trained model does not exist."

    paddle.init(use_gpu=use_gpu, trainer_count=1)

    # Restore the trained parameters from the gzipped tar archive.
    with gzip.open(model_path, "r") as param_file:
        parameters = paddle.parameters.Parameters.from_tar(param_file)

    # The dictionary has one "<word>\t..." entry per line; the line number
    # is the word id.
    id_to_text = {}
    with open(word_dict_path, "r") as dict_file:
        for word_id, entry in enumerate(dict_file):
            id_to_text[word_id] = entry.strip().split("\t")[0]

    # Build the generation (beam search) variant of the network.
    beam_gen = encoder_decoder_network(
        word_count=len(id_to_text),
        emb_dim=512,
        encoder_depth=3,
        encoder_hidden_dim=512,
        decoder_depth=3,
        decoder_hidden_dim=512,
        is_generating=True,
        beam_size=beam_size,
        max_length=10)
    inferer = paddle.inference.Inference(
        output_layer=beam_gen, parameters=parameters)

    test_batch = []
    with open(save_file, "w") as fout:
        for _, item in enumerate(
                reader.gen_reader(test_data_path, word_dict_path)()):
            test_batch.append([item])
            if len(test_batch) == batch_size:
                infer_a_batch(inferer, test_batch, beam_size, id_to_text, fout)
                test_batch = []

        # Decode the final, possibly partial, batch.
        if len(test_batch):
            infer_a_batch(inferer, test_batch, beam_size, id_to_text, fout)
            test_batch = []
if __name__ == "__main__":
    # Decode data/input.txt with the checkpoint from pass 25.
    gen_config = dict(
        model_path="models/pass_00025.tar.gz",
        word_dict_path="data/word_dict.txt",
        test_data_path="data/input.txt",
        save_file="gen_result.txt",
        batch_size=4,
        beam_size=5,
        use_gpu=True)
    generate(**gen_config)
#!/usr/bin/env python
#coding=utf-8
import paddle.v2 as paddle
from paddle.v2.layer import parse_network
__all__ = ["encoder_decoder_network"]
def _bidirect_lstm_encoder(input, hidden_dim, depth):
    """Stacked bidirectional LSTM encoder.

    Builds ``depth`` LSTM layers in each direction ("fwd" and "bwd") and
    concatenates the top layer of the two stacks.  Within a stack the
    scan direction alternates layer by layer (and the "bwd" stack starts
    with the opposite direction of the "fwd" stack).
    """
    top_layers = []
    for direction in ["fwd", "bwd"]:
        for layer_idx in range(depth):
            if layer_idx:
                # Deeper layers consume the previous layer's projection
                # and LSTM output.
                proj_inputs = [
                    paddle.layer.full_matrix_projection(input_proj),
                    paddle.layer.full_matrix_projection(
                        rnn_out,
                        param_attr=paddle.attr.Param(initial_std=5e-4)),
                ]
            else:
                # The bottom layer consumes the raw input sequence.
                proj_inputs = [paddle.layer.full_matrix_projection(input)]
            input_proj = paddle.layer.mixed(
                name="__in_proj_%0d_%s__" % (layer_idx, direction),
                size=hidden_dim * 4,
                bias_attr=True,
                input=proj_inputs)
            rnn_out = paddle.layer.lstmemory(
                input=input_proj,
                bias_attr=paddle.attr.Param(initial_std=0.),
                param_attr=paddle.attr.Param(initial_std=5e-4),
                reverse=(layer_idx % 2
                         if direction == "fwd" else not layer_idx % 2))
        top_layers.append(rnn_out)
    return paddle.layer.concat(input=top_layers)
def _attended_decoder_step(word_count, enc_out, enc_out_proj,
                           decoder_hidden_dim, depth, trg_emb):
    """One step of the attention-equipped decoder.

    Reads the previous decoder state from a step memory, computes an
    attention context over the encoder outputs, runs it through ``depth``
    stacked LSTM units, and returns the softmax distribution over the
    next word.
    """
    # Memory holding the bottom decoder layer's state from the last step.
    decoder_memory = paddle.layer.memory(
        name="__decoder_0__", size=decoder_hidden_dim, boot_layer=None)

    # Attention context over the encoder sequence, conditioned on the
    # previous decoder state.
    context = paddle.networks.simple_attention(
        encoded_sequence=enc_out,
        encoded_proj=enc_out_proj,
        decoder_state=decoder_memory)

    for layer_idx in range(depth):
        if layer_idx:
            # Deeper layers consume the previous projection and LSTM output.
            proj_inputs = [
                paddle.layer.full_matrix_projection(input_proj),
                paddle.layer.full_matrix_projection(lstm_out)
            ]
        else:
            # The bottom layer consumes the attention context and the
            # target-word embedding.
            proj_inputs = [
                paddle.layer.full_matrix_projection(context),
                paddle.layer.full_matrix_projection(trg_emb)
            ]
        input_proj = paddle.layer.mixed(
            act=paddle.activation.Linear(),
            size=decoder_hidden_dim * 4,
            bias_attr=False,
            input=proj_inputs)
        lstm_out = paddle.networks.lstmemory_unit(
            input=input_proj,
            input_proj_layer_attr=paddle.attr.ExtraLayerAttribute(
                error_clipping_threshold=25.),
            # Only the bottom layer writes back into the step memory.
            out_memory=decoder_memory if not layer_idx else None,
            name="__decoder_%d__" % (layer_idx),
            size=decoder_hidden_dim,
            act=paddle.activation.Tanh(),
            gate_act=paddle.activation.Sigmoid(),
            state_act=paddle.activation.Tanh())

    # Project the top LSTM output onto the vocabulary distribution.
    return paddle.layer.fc(
        size=word_count,
        bias_attr=True,
        act=paddle.activation.Softmax(),
        input=lstm_out)
def encoder_decoder_network(word_count,
                            emb_dim,
                            encoder_depth,
                            encoder_hidden_dim,
                            decoder_depth,
                            decoder_hidden_dim,
                            beam_size=10,
                            max_length=15,
                            is_generating=False):
    """Build the attention-based encoder-decoder network.

    Returns the beam-search generator layer when ``is_generating`` is
    True, otherwise the classification cost used for training.  Source
    and target sides share one embedding matrix (``__embedding__``).
    """
    # Source-side word embedding.
    src_emb = paddle.layer.embedding(
        input=paddle.layer.data(
            name="src_word_id",
            type=paddle.data_type.integer_value_sequence(word_count)),
        size=emb_dim,
        param_attr=paddle.attr.ParamAttr(name="__embedding__"))

    enc_out = _bidirect_lstm_encoder(
        input=src_emb, hidden_dim=encoder_hidden_dim, depth=encoder_depth)
    enc_out_proj = paddle.layer.fc(
        act=paddle.activation.Linear(),
        size=encoder_hidden_dim,
        bias_attr=False,
        input=enc_out)

    decoder_group_name = "decoder_group"
    # Static inputs are broadcast unchanged to every decoding step.
    group_inputs = [
        word_count, paddle.layer.StaticInput(input=enc_out),
        paddle.layer.StaticInput(input=enc_out_proj), decoder_hidden_dim,
        decoder_depth
    ]

    if not is_generating:
        # Training branch: feed the gold target words, return the cost.
        trg_emb = paddle.layer.embedding(
            input=paddle.layer.data(
                name="trg_word_id",
                type=paddle.data_type.integer_value_sequence(word_count)),
            size=emb_dim,
            param_attr=paddle.attr.ParamAttr(name="__embedding__"))
        lbl = paddle.layer.data(
            name="trg_next_word",
            type=paddle.data_type.integer_value_sequence(word_count))
        next_word = paddle.layer.recurrent_group(
            name=decoder_group_name,
            step=_attended_decoder_step,
            input=group_inputs + [trg_emb])
        return paddle.layer.classification_cost(input=next_word, label=lbl)

    # Generation branch: the decoder feeds back its own predictions.
    gen_trg_emb = paddle.layer.GeneratedInput(
        size=word_count,
        embedding_name="__embedding__",
        embedding_size=emb_dim)
    return paddle.layer.beam_search(
        name=decoder_group_name,
        step=_attended_decoder_step,
        input=group_inputs + [gen_trg_emb],
        bos_id=0,
        eos_id=1,
        beam_size=beam_size,
        max_length=max_length)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utils import load_dict
def train_reader(data_file_path, word_dict_file):
    """Build a data reader for training.

    Each data line is expected to look like ``<id>\\t<title>\\t<poem>``
    where the poem's sentences are separated by ``.``.  For every pair of
    adjacent sentences the reader yields a training triple
    ``(source ids, target input ids, target label ids)``: the target
    input drops the trailing <e> mark, the label drops the leading <s>.

    :param data_file_path: path to the UTF-8 poetry corpus.
    :param word_dict_file: path to the word dictionary for ``load_dict``.
    :return: a generator factory suitable for paddle readers.
    """

    def reader():
        word_dict = load_dict(word_dict_file)
        unk_id = word_dict[u"<unk>"]
        bos_id = word_dict[u"<s>"]
        eos_id = word_dict[u"<e>"]

        # BUG FIX: open in binary mode so .decode() also works on
        # Python 3 (str has no decode method there).
        with open(data_file_path, "rb") as f:
            for line in f:
                line_split = line.strip().decode(
                    "utf8", errors="ignore").split("\t")
                if len(line_split) < 3:
                    continue

                poetry = line_split[2].split(".")
                poetry_ids = []
                for sen in poetry:
                    if sen:
                        # drop internal whitespace before mapping to ids
                        poetry_ids.append([bos_id] + [
                            word_dict.get(word, unk_id)
                            for word in "".join(sen.split())
                        ] + [eos_id])

                num_sens = len(poetry_ids)
                if num_sens < 2:
                    continue
                for i in range(num_sens - 1):
                    yield (poetry_ids[i], poetry_ids[i + 1][:-1],
                           poetry_ids[i + 1][1:])

    return reader
def gen_reader(data_file_path, word_dict_file):
    """Build a data reader for generation.

    Each input line is one source sentence: whitespace is removed, every
    character is mapped to its word id (<unk> for out-of-vocabulary), and
    the sequence is wrapped with the <s> / <e> marks.

    :param data_file_path: path to the UTF-8 input sentences.
    :param word_dict_file: path to the word dictionary for ``load_dict``.
    :return: a generator factory suitable for paddle readers.
    """

    def reader():
        word_dict = load_dict(word_dict_file)
        unk_id = word_dict[u"<unk>"]
        bos_id = word_dict[u"<s>"]
        eos_id = word_dict[u"<e>"]

        # BUG FIX: binary mode keeps .decode() valid on Python 3 as well.
        with open(data_file_path, "rb") as f:
            for line in f:
                input_line = "".join(
                    line.strip().decode("utf8", errors="ignore").split())
                yield ([bos_id] +
                       [word_dict.get(word, unk_id)
                        for word in input_line] + [eos_id])

    return reader
#!/usr/bin/env python
#coding=utf-8
import gzip
import os
import logging
import paddle.v2 as paddle
import reader
from paddle.v2.layer import parse_network
from network_conf import encoder_decoder_network
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def save_model(save_path, parameters):
    """Serialize ``parameters`` into a gzip-compressed tar at ``save_path``."""
    with gzip.open(save_path, "w") as model_file:
        parameters.to_tar(model_file)
def load_initial_model(model_path, parameters):
    """Initialize ``parameters`` from a gzip-compressed tar at ``model_path``."""
    with gzip.open(model_path, "rb") as model_file:
        parameters.init_from_tar(model_file)
def main(num_passes,
         batch_size,
         use_gpu,
         trainer_count,
         save_dir_path,
         encoder_depth,
         decoder_depth,
         word_dict_path,
         train_data_path,
         init_model_path=""):
    """Train the encoder-decoder poetry model.

    A checkpoint is written to ``save_dir_path`` every 2000 batches and at
    the end of every pass; the cost is logged every 5 batches.  When
    ``init_model_path`` is non-empty, training resumes from that model.
    """
    if not os.path.exists(save_dir_path):
        os.mkdir(save_dir_path)

    # initialize PaddlePaddle
    paddle.init(use_gpu=use_gpu, trainer_count=trainer_count, parallel_nn=1)

    # optimization method used by the trainer
    optimizer = paddle.optimizer.AdaDelta(
        learning_rate=1e-3,
        gradient_clipping_threshold=25.0,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=2500))

    # The vocabulary size is the number of lines in the dictionary file.
    cost = encoder_decoder_network(
        word_count=len(open(word_dict_path, "r").readlines()),
        emb_dim=512,
        encoder_depth=encoder_depth,
        encoder_hidden_dim=512,
        decoder_depth=decoder_depth,
        decoder_hidden_dim=512)

    parameters = paddle.parameters.create(cost)
    if init_model_path:
        load_initial_model(init_model_path, parameters)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # shuffled, batched training-data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.train_reader(train_data_path, word_dict_path),
            buf_size=1024000),
        batch_size=batch_size)

    def event_handler(event):
        # Checkpoint every 2000 batches; log the cost every 5 batches.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id and not event.batch_id % 2000:
                save_path = os.path.join(
                    save_dir_path, "pass_%05d_batch_%05d.tar.gz" %
                    (event.pass_id, event.batch_id))
                save_model(save_path, parameters)
            if not event.batch_id % 5:
                logger.info("Pass %d, Batch %d, Cost %f, %s" %
                            (event.pass_id, event.batch_id, event.cost,
                             event.metrics))
        if isinstance(event, paddle.event.EndPass):
            save_path = os.path.join(save_dir_path,
                                     "pass_%05d.tar.gz" % event.pass_id)
            save_model(save_path, parameters)

    # start training
    trainer.train(
        reader=train_reader, event_handler=event_handler, num_passes=num_passes)
if __name__ == '__main__':
    # Training configuration; paths are relative to the working directory.
    train_config = dict(
        num_passes=500,
        batch_size=4 * 500,
        use_gpu=True,
        trainer_count=4,
        encoder_depth=3,
        decoder_depth=3,
        save_dir_path="models",
        word_dict_path="data/word_dict.txt",
        train_data_path="data/song.poet.txt",
        init_model_path="")
    main(**train_config)
#!/usr/bin/env python
#coding=utf-8
import os
import sys
import re
from collections import defaultdict
def load_dict(word_dict_file):
    """Load the word dictionary into a ``{word: id}`` mapping.

    The file has one ``<word>\\t...`` entry per line; a word's id is its
    0-based line number.

    :param word_dict_file: path to the UTF-8 dictionary file.
    :return: dict mapping each word (unicode) to its integer id.
    """
    word_dict = {}
    # BUG FIX: read in binary mode so .decode() also works on Python 3.
    with open(word_dict_file, "rb") as fin:
        for i, line in enumerate(fin):
            key = line.strip().decode("utf8", errors="ignore").split("\t")[0]
            word_dict[key] = i
    return word_dict
def find_optiaml_pass(log_file):
    """Scan a training log and print the pass with the lowest average cost.

    Lines are expected to contain ``Pass <n>`` and ``Cost <float>``
    fragments, as produced by the training event handler.

    NOTE: the misspelled name ("optiaml") is kept for backward
    compatibility with existing callers.

    :param log_file: path to the training log.
    :raises ValueError: if the log contains no cost information.
    """
    cost_info = defaultdict(list)
    # BUG FIX: escape the dot so only a literal decimal point matches
    # (the original `.` matched any character).
    cost_pat = re.compile(r'Cost\s\d+\.\d+')
    pass_pat = re.compile(r'Pass\s\d+')

    with open(log_file, 'r') as flog:
        for line in flog:
            if 'Cost' not in line:
                continue
            pass_id = pass_pat.findall(line.strip())[0]
            cost = float(cost_pat.findall(line.strip())[0].replace('Cost ', ''))
            cost_info[pass_id].append(cost)

    if not cost_info:
        raise ValueError("no cost information found in %s" % log_file)

    # BUG FIX: dict.iteritems() does not exist on Python 3; use items().
    # min() also avoids sorting the whole list just to take element 0.
    best_pass = min(
        cost_info.items(), key=lambda item: sum(item[1]) / len(item[1]))[0]
    print("optimal pass : %s" % best_pass)
if __name__ == '__main__':
    # Inspect the first-round training log for the best pass.
    log_path = 'trained_models/models_first_round/train.log'
    find_optiaml_pass(log_path)
#!/usr/bin/env python
# coding=utf-8
import os
import math
import numpy as np
......
#!/usr/bin/env python
# coding=utf-8
import os
################## for building word dictionary ##################
......
# coding=utf-8
import os
import gzip
import numpy as np
......
# coding=utf-8
import collections
import os
......
#!/usr/bin/env python
# coding=utf-8
import os
import sys
import gzip
import pdb
import paddle.v2 as paddle
import config as conf
......
#!/usr/bin/env python
# coding=utf-8
import os
import logging
from collections import defaultdict
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册