未验证 提交 1edc8317 编写于 作者: C Cao Ying 提交者: GitHub

Merge pull request #431 from ranqiu92/convseq2seq

Add the conv_seq_to_seq model (first version).
# Convolutional Sequence to Sequence Learning
This model implements the work in the following paper:
Jonas Gehring, Micheal Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017
# Training a Model
- Modify the following script if needed and then run:
```bash
python train.py \
--train_data_path ./data/train_data \
--test_data_path ./data/test_data \
--src_dict_path ./data/src_dict \
--trg_dict_path ./data/trg_dict \
--enc_blocks "[(256, 3)] * 5" \
--dec_blocks "[(256, 3)] * 3" \
--emb_size 256 \
--pos_size 200 \
--drop_rate 0.1 \
--use_gpu False \
--trainer_count 1 \
--batch_size 32 \
--num_passes 20 \
>train.log 2>&1
```
# Inferring by a Trained Model
- Infer by a trained model by running:
```bash
python infer.py \
--infer_data_path ./data/infer_data \
--src_dict_path ./data/src_dict \
--trg_dict_path ./data/trg_dict \
--enc_blocks "[(256, 3)] * 5" \
--dec_blocks "[(256, 3)] * 3" \
--emb_size 256 \
--pos_size 200 \
--drop_rate 0.1 \
--use_gpu False \
--trainer_count 1 \
--max_len 100 \
--beam_size 1 \
--model_path ./params.pass-0.tar.gz \
1>infer_result 2>infer.log
```
# Notes
Currently, beam search will forward the encoder multiple times when predicting each target word, which requires extra computations. And we will fix it later.
#coding=utf-8
import sys
import time
import numpy as np
class BeamSearch(object):
"""
Generate sequence by beam search
NOTE: this class only implements generating one sentence at a time.
"""
def __init__(self,
inferer,
trg_dict,
pos_size,
padding_num,
beam_size=1,
max_len=100):
self.inferer = inferer
self.trg_dict = trg_dict
self.word_padding = trg_dict.__len__()
self.pos_size = pos_size
self.pos_padding = pos_size
self.padding_num = padding_num
self.win_len = padding_num + 1
self.max_len = max_len
self.beam_size = beam_size
def get_beam_input(self, pre_beam_list, infer_data):
"""
Get input for generation at the current iteration.
"""
beam_input = []
if len(pre_beam_list) == 0:
cur_trg = [self.word_padding
] * self.padding_num + [self.trg_dict['<s>']]
cur_trg_pos = [self.pos_padding] * self.padding_num + [0]
beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
else:
for seq in pre_beam_list:
if len(seq) < self.win_len:
cur_trg = [self.word_padding] * (
self.win_len - len(seq) - 1
) + [self.trg_dict['<s>']] + seq
cur_trg_pos = [self.pos_padding] * (
self.win_len - len(seq) - 1) + [0] + range(1,
len(seq) + 1)
else:
cur_trg = seq[-self.win_len:]
cur_trg_pos = range(
len(seq) + 1 - self.win_len, len(seq) + 1)
beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
return beam_input
def get_prob(self, beam_input):
"""
Get the probabilities of all possible tokens.
"""
row_list = [j * self.win_len for j in range(len(beam_input))]
prob = self.inferer.infer(beam_input, field='value')[row_list, :]
return prob
def get_candidate(self, pre_beam_list, pre_beam_score, prob):
"""
Get top beam_size tokens and their scores for each beam.
"""
if prob.ndim == 1:
candidate_id = prob.argsort()[-self.beam_size:][::-1]
candidate_log_prob = np.log(prob[candidate_id])
else:
candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1]
candidate_log_prob = np.zeros_like(candidate_id).astype('float32')
for j in range(len(pre_beam_list)):
candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]])
if pre_beam_score.size > 0:
candidate_score = candidate_log_prob + pre_beam_score.reshape(
(pre_beam_score.size, 1))
else:
candidate_score = candidate_log_prob
return candidate_id, candidate_score
def prune(self, candidate_id, candidate_score, pre_beam_list,
completed_seq_list, completed_seq_score, completed_seq_min_score):
"""
Pruning process of the beam search. During the process, beam_size most possible sequences
are selected for the beam in the next iteration. Besides, their scores and the minimum score
of the completed sequences are updated.
"""
candidate_id = candidate_id.flatten()
candidate_score = candidate_score.flatten()
topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist()
topk_seq_idx = [idx / self.beam_size for idx in topk_idx]
next_beam = []
beam_score = []
for j in range(len(topk_idx)):
if candidate_id[topk_idx[j]] == self.trg_dict['<e>']:
if len(
completed_seq_list
) < self.beam_size or completed_seq_min_score <= candidate_score[
topk_idx[j]]:
completed_seq_list.append(pre_beam_list[topk_seq_idx[j]])
completed_seq_score.append(candidate_score[topk_idx[j]])
if completed_seq_min_score is None or (
completed_seq_min_score >=
candidate_score[topk_idx[j]] and
len(completed_seq_list) < self.beam_size):
completed_seq_min_score = candidate_score[topk_idx[j]]
else:
seq = pre_beam_list[topk_seq_idx[
j]] + [candidate_id[topk_idx[j]]]
score = candidate_score[topk_idx[j]]
next_beam.append(seq)
beam_score.append(score)
beam_score = np.array(beam_score)
return next_beam, beam_score, completed_seq_min_score
def search_one_sample(self, infer_data):
"""
Beam search process for one sample.
"""
completed_seq_list = []
completed_seq_score = []
completed_seq_min_score = None
uncompleted_seq_list = [[]]
uncompleted_seq_score = np.zeros(0)
for i in xrange(self.max_len):
beam_input = self.get_beam_input(uncompleted_seq_list, infer_data)
prob = self.get_prob(beam_input)
candidate_id, candidate_score = self.get_candidate(
uncompleted_seq_list, uncompleted_seq_score, prob)
uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune(
candidate_id, candidate_score, uncompleted_seq_list,
completed_seq_list, completed_seq_score,
completed_seq_min_score)
if len(uncompleted_seq_list) == 0:
break
if len(completed_seq_list) >= self.beam_size:
seq_max_score = uncompleted_seq_score.max()
if seq_max_score < completed_seq_min_score:
uncompleted_seq_list = []
break
final_seq_list = completed_seq_list + uncompleted_seq_list
final_score = np.concatenate(
(np.array(completed_seq_score), uncompleted_seq_score))
max_id = final_score.argmax()
top_seq = final_seq_list[max_id]
return top_seq
#coding=utf-8
import sys
import argparse
import distutils.util
import gzip
import paddle.v2 as paddle
from model import conv_seq2seq
from beamsearch import BeamSearch
import reader
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Convolutional Seq2Seq")
parser.add_argument(
'--infer_data_path',
type=str,
required=True,
help="Path of the dataset for inference")
parser.add_argument(
'--src_dict_path',
type=str,
required=True,
help='Path of the source dictionary')
parser.add_argument(
'--trg_dict_path',
type=str,
required=True,
help='path of the target dictionary')
parser.add_argument(
'--enc_blocks', type=str, help='Convolution blocks of the encoder')
parser.add_argument(
'--dec_blocks', type=str, help='Convolution blocks of the decoder')
parser.add_argument(
'--emb_size',
type=int,
default=512,
help='Dimension of word embedding. (default: %(default)s)')
parser.add_argument(
'--pos_size',
type=int,
default=200,
help='Total number of the position indexes. (default: %(default)s)')
parser.add_argument(
'--drop_rate',
type=float,
default=0.,
help='Dropout rate. (default: %(default)s)')
parser.add_argument(
"--use_gpu",
default=False,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
'--max_len',
type=int,
default=100,
help="The maximum length of the sentence to be generated. (default: %(default)s)"
)
parser.add_argument(
"--beam_size",
default=1,
type=int,
help="The width of beam expasion. (default: %(default)s)")
parser.add_argument(
"--model_path",
type=str,
required=True,
help="The path of trained model. (default: %(default)s)")
return parser.parse_args()
def to_sentence(seq, dictionary):
raw_sentence = [dictionary[id] for id in seq]
sentence = " ".join(raw_sentence)
return sentence
def infer(infer_data_path,
src_dict_path,
trg_dict_path,
model_path,
enc_conv_blocks,
dec_conv_blocks,
emb_dim=512,
pos_size=200,
drop_rate=0.,
max_len=100,
beam_size=1):
"""
Inference.
:param infer_data_path: The path of the data for inference.
:type infer_data_path: str
:param src_dict_path: The path of the source dictionary.
:type src_dict_path: str
:param trg_dict_path: The path of the target dictionary.
:type trg_dict_path: str
:param model_path: The path of a trained model.
:type model_path: str
:param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type enc_conv_blocks: list of tuple
:param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type dec_conv_blocks: list of tuple
:param emb_dim: The dimension of the embedding vector.
:type emb_dim: int
:param pos_size: The total number of the position indexes, which means
the maximum value of the index is pos_size - 1.
:type pos_size: int
:param drop_rate: Dropout rate.
:type drop_rate: float
:param max_len: The maximum length of the sentence to be generated.
:type max_len: int
:param beam_size: The width of beam expansion.
:type beam_size: int
"""
# load dict
src_dict = reader.load_dict(src_dict_path)
trg_dict = reader.load_dict(trg_dict_path)
src_dict_size = src_dict.__len__()
trg_dict_size = trg_dict.__len__()
prob = conv_seq2seq(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
pos_size=pos_size,
emb_dim=emb_dim,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
drop_rate=drop_rate,
is_infer=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
padding_num = reduce(lambda x, y: x + y, padding_list)
infer_reader = reader.data_reader(
data_file=infer_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num)
inferer = paddle.inference.Inference(
output_layer=prob, parameters=parameters)
searcher = BeamSearch(
inferer=inferer,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num,
max_len=max_len,
beam_size=beam_size)
reverse_trg_dict = reader.get_reverse_dict(trg_dict)
for i, raw_data in enumerate(infer_reader()):
infer_data = [raw_data[0], raw_data[1]]
result = searcher.search_one_sample(infer_data)
sentence = to_sentence(result, reverse_trg_dict)
print sentence
sys.stdout.flush()
return
def main():
args = parse_args()
enc_conv_blocks = eval(args.enc_blocks)
dec_conv_blocks = eval(args.dec_blocks)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
infer(
infer_data_path=args.infer_data_path,
src_dict_path=args.src_dict_path,
trg_dict_path=args.trg_dict_path,
model_path=args.model_path,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
emb_dim=args.emb_size,
pos_size=args.pos_size,
drop_rate=args.drop_rate,
max_len=args.max_len,
beam_size=args.beam_size)
if __name__ == '__main__':
main()
#coding=utf-8
import math
import paddle.v2 as paddle
__all__ = ["conv_seq2seq"]
def gated_conv_with_batchnorm(input,
size,
context_len,
context_start=None,
learning_rate=1.0,
drop_rate=0.):
"""
Definition of the convolution block.
:param input: The input of this block.
:type input: LayerOutput
:param size: The dimension of the block's output.
:type size: int
:param context_len: The context length of the convolution.
:type context_len: int
:param context_start: The start position of the context.
:type context_start: int
:param learning_rate: The learning rate factor of the parameters in the block.
The actual learning rate is the product of the global
learning rate and this factor.
:type learning_rate: float
:param drop_rate: Dropout rate.
:type drop_rate: float
:return: The output of the convolution block.
:rtype: LayerOutput
"""
input = paddle.layer.dropout(input=input, dropout_rate=drop_rate)
context = paddle.layer.mixed(
size=input.size * context_len,
input=paddle.layer.context_projection(
input=input, context_len=context_len, context_start=context_start))
raw_conv = paddle.layer.fc(
input=context,
size=size * 2,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size),
learning_rate=learning_rate),
bias_attr=False)
batch_norm_conv = paddle.layer.batch_norm(
input=raw_conv,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(learning_rate=learning_rate))
with paddle.layer.mixed(size=size) as conv:
conv += paddle.layer.identity_projection(
batch_norm_conv, size=size, offset=0)
with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate:
gate += paddle.layer.identity_projection(
batch_norm_conv, size=size, offset=size)
with paddle.layer.mixed(size=size) as gated_conv:
gated_conv += paddle.layer.dotmul_operator(conv, gate)
return gated_conv
def encoder(token_emb,
pos_emb,
conv_blocks=[(256, 3)] * 5,
num_attention=3,
drop_rate=0.1):
"""
Definition of the encoder.
:param token_emb: The embedding vector of the input token.
:type token_emb: LayerOutput
:param pos_emb: The embedding vector of the input token's position.
:type pos_emb: LayerOutput
:param conv_blocks: The scale list of the convolution blocks. Each element of
the list contains output dimension and context length of
the corresponding convolution block.
:type conv_blocks: list of tuple
:param num_attention: The total number of the attention modules used in the decoder.
:type num_attention: int
:param drop_rate: Dropout rate.
:type drop_rate: float
:return: The input token encoding.
:rtype: LayerOutput
"""
embedding = paddle.layer.addto(
input=[token_emb, pos_emb],
layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
proj_size = conv_blocks[0][0]
block_input = paddle.layer.fc(
input=embedding,
size=proj_size,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt((1.0 - drop_rate) / embedding.size),
learning_rate=1.0 / (2.0 * num_attention)),
bias_attr=True, )
for (size, context_len) in conv_blocks:
if block_input.size == size:
residual = block_input
else:
residual = paddle.layer.fc(
input=block_input,
size=size,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(learning_rate=1.0 /
(2.0 * num_attention)),
bias_attr=True)
gated_conv = gated_conv_with_batchnorm(
input=block_input,
size=size,
context_len=context_len,
learning_rate=1.0 / (2.0 * num_attention),
drop_rate=drop_rate)
with paddle.layer.mixed(size=size) as block_output:
block_output += paddle.layer.identity_projection(residual)
block_output += paddle.layer.identity_projection(gated_conv)
# halve the variance of the sum
block_output = paddle.layer.slope_intercept(
input=block_output, slope=math.sqrt(0.5))
block_input = block_output
emb_dim = embedding.size
encoded_vec = paddle.layer.fc(
input=block_output,
size=emb_dim,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
bias_attr=True)
encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])
# halve the variance of the sum
encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))
return encoded_vec, encoded_sum
def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
"""
Definition of the attention.
:param decoder_state: The hidden state of the decoder.
:type decoder_state: LayerOutput
:param cur_embedding: The embedding vector of the current token.
:type cur_embedding: LayerOutput
:param encoded_vec: The source token encoding.
:type encoded_vec: LayerOutput
:param encoded_sum: The sum of the source token's encoding and embedding.
:type encoded_sum: LayerOutput
:return: A context vector.
:rtype: LayerOutput
"""
residual = decoder_state
state_size = decoder_state.size
emb_dim = cur_embedding.size
with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary:
state_summary += paddle.layer.full_matrix_projection(decoder_state)
state_summary += paddle.layer.identity_projection(cur_embedding)
# halve the variance of the sum
state_summary = paddle.layer.slope_intercept(
input=state_summary, slope=math.sqrt(0.5))
expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec)
m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec)
attention_weight = paddle.layer.fc(
input=m,
size=1,
act=paddle.activation.SequenceSoftmax(),
bias_attr=False)
scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
attended = paddle.layer.pooling(
input=scaled, pooling_type=paddle.pooling.Sum())
attended_proj = paddle.layer.fc(
input=attended,
size=state_size,
act=paddle.activation.Linear(),
bias_attr=True)
attention_result = paddle.layer.addto(input=[attended_proj, residual])
# halve the variance of the sum
attention_result = paddle.layer.slope_intercept(
input=attention_result, slope=math.sqrt(0.5))
return attention_result
def decoder(token_emb,
pos_emb,
encoded_vec,
encoded_sum,
dict_size,
conv_blocks=[(256, 3)] * 3,
drop_rate=0.1):
"""
Definition of the decoder.
:param token_emb: The embedding vector of the input token.
:type token_emb: LayerOutput
:param pos_emb: The embedding vector of the input token's position.
:type pos_emb: LayerOutput
:param encoded_vec: The source token encoding.
:type encoded_vec: LayerOutput
:param encoded_sum: The sum of the source token's encoding and embedding.
:type encoded_sum: LayerOutput
:param dict_size: The size of the target dictionary.
:type dict_size: int
:param conv_blocks: The scale list of the convolution blocks. Each element
of the list contains output dimension and context length
of the corresponding convolution block.
:type conv_blocks: list of tuple
:param drop_rate: Dropout rate.
:type drop_rate: float
:return: The probability of the predicted token.
:rtype: LayerOutput
"""
def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
conditional = attention(
decoder_state=decoder_state,
cur_embedding=cur_embedding,
encoded_vec=encoded_vec,
encoded_sum=encoded_sum)
return conditional
embedding = paddle.layer.addto(
input=[token_emb, pos_emb],
layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
proj_size = conv_blocks[0][0]
block_input = paddle.layer.fc(
input=embedding,
size=proj_size,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)),
bias_attr=True, )
for (size, context_len) in conv_blocks:
if block_input.size == size:
residual = block_input
else:
residual = paddle.layer.fc(
input=block_input,
size=size,
act=paddle.activation.Linear(),
bias_attr=True)
decoder_state = gated_conv_with_batchnorm(
input=block_input,
size=size,
context_len=context_len,
context_start=0,
drop_rate=drop_rate)
group_inputs = [
decoder_state,
embedding,
paddle.layer.StaticInput(input=encoded_vec),
paddle.layer.StaticInput(input=encoded_sum),
]
conditional = paddle.layer.recurrent_group(
step=attention_step, input=group_inputs)
block_output = paddle.layer.addto(input=[conditional, residual])
# halve the variance of the sum
block_output = paddle.layer.slope_intercept(
input=block_output, slope=math.sqrt(0.5))
block_input = block_output
out_emb_dim = embedding.size
block_output = paddle.layer.fc(
input=block_output,
size=out_emb_dim,
act=paddle.activation.Linear(),
layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
decoder_out = paddle.layer.fc(
input=block_output,
size=dict_size,
act=paddle.activation.Softmax(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
bias_attr=True)
return decoder_out
def conv_seq2seq(src_dict_size,
trg_dict_size,
pos_size,
emb_dim,
enc_conv_blocks=[(256, 3)] * 5,
dec_conv_blocks=[(256, 3)] * 3,
drop_rate=0.1,
is_infer=False):
"""
Definition of convolutional sequence-to-sequence network.
:param src_dict_size: The size of the source dictionary.
:type src_dict_size: int
:param trg_dict_size: The size of the target dictionary.
:type trg_dict_size: int
:param pos_size: The total number of the position indexes, which means
the maximum value of the index is pos_size - 1.
:type pos_size: int
:param emb_dim: The dimension of the embedding vector.
:type emb_dim: int
:param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
of the list contains output dimension and context length of the
corresponding convolution block.
:type enc_conv_blocks: list of tuple
:param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
of the list contains output dimension and context length of the
corresponding convolution block.
:type dec_conv_blocks: list of tuple
:param drop_rate: Dropout rate.
:type drop_rate: float
:param is_infer: Whether infer or not.
:type is_infer: bool
:return: Cost or output layer.
:rtype: LayerOutput
"""
src = paddle.layer.data(
name='src_word',
type=paddle.data_type.integer_value_sequence(src_dict_size))
src_pos = paddle.layer.data(
name='src_word_pos',
type=paddle.data_type.integer_value_sequence(pos_size +
1)) # one for padding
src_emb = paddle.layer.embedding(
input=src,
size=emb_dim,
name='src_word_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
src_pos_emb = paddle.layer.embedding(
input=src_pos,
size=emb_dim,
name='src_pos_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
num_attention = len(dec_conv_blocks)
encoded_vec, encoded_sum = encoder(
token_emb=src_emb,
pos_emb=src_pos_emb,
conv_blocks=enc_conv_blocks,
num_attention=num_attention,
drop_rate=drop_rate)
trg = paddle.layer.data(
name='trg_word',
type=paddle.data_type.integer_value_sequence(trg_dict_size +
1)) # one for padding
trg_pos = paddle.layer.data(
name='trg_word_pos',
type=paddle.data_type.integer_value_sequence(pos_size +
1)) # one for padding
trg_emb = paddle.layer.embedding(
input=trg,
size=emb_dim,
name='trg_word_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
trg_pos_emb = paddle.layer.embedding(
input=trg_pos,
size=emb_dim,
name='trg_pos_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
decoder_out = decoder(
token_emb=trg_emb,
pos_emb=trg_pos_emb,
encoded_vec=encoded_vec,
encoded_sum=encoded_sum,
dict_size=trg_dict_size,
conv_blocks=dec_conv_blocks,
drop_rate=drop_rate)
if is_infer:
return decoder_out
trg_next_word = paddle.layer.data(
name='trg_next_word',
type=paddle.data_type.integer_value_sequence(trg_dict_size))
cost = paddle.layer.classification_cost(
input=decoder_out, label=trg_next_word)
return cost
#coding=utf-8
import random
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
return word_dict
def get_reverse_dict(dictionary):
reverse_dict = {dictionary[k]: k for k in dictionary.keys()}
return reverse_dict
def load_data(data_file, src_dict, trg_dict):
UNK_IDX = src_dict['<unk>']
with open(data_file, 'r') as f:
for line in f:
line_split = line.strip().split('\t')
if len(line_split) < 2:
continue
src, trg = line_split
src_words = src.strip().split()
trg_words = trg.strip().split()
src_seq = [src_dict.get(w, UNK_IDX) for w in src_words]
trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words]
yield src_seq, trg_seq
def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
def reader():
UNK_IDX = src_dict['<unk>']
word_padding = trg_dict.__len__()
pos_padding = pos_size
def _get_pos(pos_list, pos_size, pos_padding):
return [pos if pos < pos_size else pos_padding for pos in pos_list]
with open(data_file, 'r') as f:
for line in f:
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src, trg = line_split
src = src.strip().split()
src_word = [src_dict.get(w, UNK_IDX) for w in src]
src_word_pos = range(len(src_word))
src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding)
trg = trg.strip().split()
trg_word = [trg_dict['<s>']
] + [trg_dict.get(w, UNK_IDX) for w in trg]
trg_word_pos = range(len(trg_word))
trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding)
trg_next_word = trg_word[1:] + [trg_dict['<e>']]
trg_word = [word_padding] * padding_num + trg_word
trg_word_pos = [pos_padding] * padding_num + trg_word_pos
trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num
yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word
return reader
#coding=utf-8
import os
import sys
import time
import argparse
import distutils.util
import gzip
import numpy as np
import paddle.v2 as paddle
from model import conv_seq2seq
import reader
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Convolutional Seq2Seq")
parser.add_argument(
'--train_data_path',
type=str,
required=True,
help="Path of the training set")
parser.add_argument(
'--test_data_path', type=str, help='Path of the test set')
parser.add_argument(
'--src_dict_path',
type=str,
required=True,
help='Path of source dictionary')
parser.add_argument(
'--trg_dict_path',
type=str,
required=True,
help='Path of target dictionary')
parser.add_argument(
'--enc_blocks', type=str, help='Convolution blocks of the encoder')
parser.add_argument(
'--dec_blocks', type=str, help='Convolution blocks of the decoder')
parser.add_argument(
'--emb_size',
type=int,
default=512,
help='Dimension of word embedding. (default: %(default)s)')
parser.add_argument(
'--pos_size',
type=int,
default=200,
help='Total number of the position indexes. (default: %(default)s)')
parser.add_argument(
'--drop_rate',
type=float,
default=0.,
help='Dropout rate. (default: %(default)s)')
parser.add_argument(
"--use_gpu",
default=False,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help="Size of a mini-batch. (default: %(default)s)")
parser.add_argument(
'--num_passes',
type=int,
default=15,
help="Number of passes to train. (default: %(default)s)")
return parser.parse_args()
def create_reader(padding_num,
train_data_path,
test_data_path=None,
src_dict=None,
trg_dict=None,
pos_size=200,
batch_size=32):
train_reader = paddle.batch(
reader=paddle.reader.shuffle(
reader=reader.data_reader(
data_file=train_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num),
buf_size=10240),
batch_size=batch_size)
test_reader = None
if test_data_path:
test_reader = paddle.batch(
reader=paddle.reader.shuffle(
reader=reader.data_reader(
data_file=test_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num),
buf_size=10240),
batch_size=batch_size)
return train_reader, test_reader
def train(train_data_path,
test_data_path,
src_dict_path,
trg_dict_path,
enc_conv_blocks,
dec_conv_blocks,
emb_dim=512,
pos_size=200,
drop_rate=0.,
batch_size=32,
num_passes=15):
"""
Train the convolution sequence-to-sequence model.
:param train_data_path: The path of the training set.
:type train_data_path: str
:param test_data_path: The path of the test set.
:type test_data_path: str
:param src_dict_path: The path of the source dictionary.
:type src_dict_path: str
:param trg_dict_path: The path of the target dictionary.
:type trg_dict_path: str
:param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type enc_conv_blocks: list of tuple
:param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type dec_conv_blocks: list of tuple
:param emb_dim: The dimension of the embedding vector.
:type emb_dim: int
:param pos_size: The total number of the position indexes, which means
the maximum value of the index is pos_size - 1.
:type pos_size: int
:param drop_rate: Dropout rate.
:type drop_rate: float
:param batch_size: The size of a mini-batch.
:type batch_size: int
:param num_passes: The total number of the passes to train.
:type num_passes: int
"""
# load dict
src_dict = reader.load_dict(src_dict_path)
trg_dict = reader.load_dict(trg_dict_path)
src_dict_size = src_dict.__len__()
trg_dict_size = trg_dict.__len__()
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3, )
cost = conv_seq2seq(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
pos_size=pos_size,
emb_dim=emb_dim,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
drop_rate=drop_rate,
is_infer=False)
# create parameters and trainer
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
padding_num = reduce(lambda x, y: x + y, padding_list)
train_reader, test_reader = create_reader(
padding_num=padding_num,
train_data_path=train_data_path,
test_data_path=test_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
batch_size=batch_size)
feeding = {
'src_word': 0,
'src_word_pos': 1,
'trg_word': 2,
'trg_word_pos': 3,
'trg_next_word': 4
}
# create event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 20 == 0:
cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
cur_time, event.pass_id, event.batch_id, event.cost,
event.metrics)
else:
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
if test_reader is not None:
cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
result = trainer.test(reader=test_reader, feeding=feeding)
print "[%s]: Pass: %d, TestCost: %f, %s" % (
cur_time, event.pass_id, result.cost, result.metrics)
sys.stdout.flush()
with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
trainer.save_parameter_to_tar(f)
if not os.path.exists('output'):
os.mkdir('output')
trainer.train(
reader=train_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
def main():
args = parse_args()
enc_conv_blocks = eval(args.enc_blocks)
dec_conv_blocks = eval(args.dec_blocks)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train(
train_data_path=args.train_data_path,
test_data_path=args.test_data_path,
src_dict_path=args.src_dict_path,
trg_dict_path=args.trg_dict_path,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
emb_dim=args.emb_size,
pos_size=args.pos_size,
drop_rate=args.drop_rate,
batch_size=args.batch_size,
num_passes=args.num_passes)
if __name__ == '__main__':
main()
# Convolutional Sequence to Sequence Learning
This model implements the work in the following paper:
Jonas Gehring, Micheal Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017
# Training a Model
- Modify the following script if needed and then run:
```bash
python train.py \
--train_data_path ./data/train_data \
--test_data_path ./data/test_data \
--src_dict_path ./data/src_dict \
--trg_dict_path ./data/trg_dict \
--enc_blocks "[(256, 3)] * 5" \
--dec_blocks "[(256, 3)] * 3" \
--emb_size 256 \
--pos_size 200 \
--drop_rate 0.1 \
--use_gpu False \
--trainer_count 1 \
--batch_size 32 \
--num_passes 20 \
>train.log 2>&1
```
# Inferring by a Trained Model
- Infer by a trained model by running:
```bash
python infer.py \
--infer_data_path ./data/infer_data \
--src_dict_path ./data/src_dict \
--trg_dict_path ./data/trg_dict \
--enc_blocks "[(256, 3)] * 5" \
--dec_blocks "[(256, 3)] * 3" \
--emb_size 256 \
--pos_size 200 \
--drop_rate 0.1 \
--use_gpu False \
--trainer_count 1 \
--max_len 100 \
--beam_size 1 \
--model_path ./params.pass-0.tar.gz \
1>infer_result 2>infer.log
```
# Notes
Currently, beam search will forward the encoder multiple times when predicting each target word, which requires extra computations. And we will fix it later.
#coding=utf-8
import sys
import time
import numpy as np
class BeamSearch(object):
"""
Generate sequence by beam search
NOTE: this class only implements generating one sentence at a time.
"""
def __init__(self,
inferer,
trg_dict,
pos_size,
padding_num,
beam_size=1,
max_len=100):
self.inferer = inferer
self.trg_dict = trg_dict
self.word_padding = trg_dict.__len__()
self.pos_size = pos_size
self.pos_padding = pos_size
self.padding_num = padding_num
self.win_len = padding_num + 1
self.max_len = max_len
self.beam_size = beam_size
def get_beam_input(self, pre_beam_list, infer_data):
"""
Get input for generation at the current iteration.
"""
beam_input = []
if len(pre_beam_list) == 0:
cur_trg = [self.word_padding
] * self.padding_num + [self.trg_dict['<s>']]
cur_trg_pos = [self.pos_padding] * self.padding_num + [0]
beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
else:
for seq in pre_beam_list:
if len(seq) < self.win_len:
cur_trg = [self.word_padding] * (
self.win_len - len(seq) - 1
) + [self.trg_dict['<s>']] + seq
cur_trg_pos = [self.pos_padding] * (
self.win_len - len(seq) - 1) + [0] + range(1,
len(seq) + 1)
else:
cur_trg = seq[-self.win_len:]
cur_trg_pos = range(
len(seq) + 1 - self.win_len, len(seq) + 1)
beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
return beam_input
def get_prob(self, beam_input):
"""
Get the probabilities of all possible tokens.
"""
row_list = [j * self.win_len for j in range(len(beam_input))]
prob = self.inferer.infer(beam_input, field='value')[row_list, :]
return prob
def get_candidate(self, pre_beam_list, pre_beam_score, prob):
"""
Get top beam_size tokens and their scores for each beam.
"""
if prob.ndim == 1:
candidate_id = prob.argsort()[-self.beam_size:][::-1]
candidate_log_prob = np.log(prob[candidate_id])
else:
candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1]
candidate_log_prob = np.zeros_like(candidate_id).astype('float32')
for j in range(len(pre_beam_list)):
candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]])
if pre_beam_score.size > 0:
candidate_score = candidate_log_prob + pre_beam_score.reshape(
(pre_beam_score.size, 1))
else:
candidate_score = candidate_log_prob
return candidate_id, candidate_score
def prune(self, candidate_id, candidate_score, pre_beam_list,
completed_seq_list, completed_seq_score, completed_seq_min_score):
"""
Pruning process of the beam search. During the process, beam_size most possible sequences
are selected for the beam in the next iteration. Besides, their scores and the minimum score
of the completed sequences are updated.
"""
candidate_id = candidate_id.flatten()
candidate_score = candidate_score.flatten()
topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist()
topk_seq_idx = [idx / self.beam_size for idx in topk_idx]
next_beam = []
beam_score = []
for j in range(len(topk_idx)):
if candidate_id[topk_idx[j]] == self.trg_dict['<e>']:
if len(
completed_seq_list
) < self.beam_size or completed_seq_min_score <= candidate_score[
topk_idx[j]]:
completed_seq_list.append(pre_beam_list[topk_seq_idx[j]])
completed_seq_score.append(candidate_score[topk_idx[j]])
if completed_seq_min_score is None or (
completed_seq_min_score >=
candidate_score[topk_idx[j]] and
len(completed_seq_list) < self.beam_size):
completed_seq_min_score = candidate_score[topk_idx[j]]
else:
seq = pre_beam_list[topk_seq_idx[
j]] + [candidate_id[topk_idx[j]]]
score = candidate_score[topk_idx[j]]
next_beam.append(seq)
beam_score.append(score)
beam_score = np.array(beam_score)
return next_beam, beam_score, completed_seq_min_score
def search_one_sample(self, infer_data):
"""
Beam search process for one sample.
"""
completed_seq_list = []
completed_seq_score = []
completed_seq_min_score = None
uncompleted_seq_list = [[]]
uncompleted_seq_score = np.zeros(0)
for i in xrange(self.max_len):
beam_input = self.get_beam_input(uncompleted_seq_list, infer_data)
prob = self.get_prob(beam_input)
candidate_id, candidate_score = self.get_candidate(
uncompleted_seq_list, uncompleted_seq_score, prob)
uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune(
candidate_id, candidate_score, uncompleted_seq_list,
completed_seq_list, completed_seq_score,
completed_seq_min_score)
if len(uncompleted_seq_list) == 0:
break
if len(completed_seq_list) >= self.beam_size:
seq_max_score = uncompleted_seq_score.max()
if seq_max_score < completed_seq_min_score:
uncompleted_seq_list = []
break
final_seq_list = completed_seq_list + uncompleted_seq_list
final_score = np.concatenate(
(np.array(completed_seq_score), uncompleted_seq_score))
max_id = final_score.argmax()
top_seq = final_seq_list[max_id]
return top_seq
#coding=utf-8
import sys
import argparse
import distutils.util
import gzip
import paddle.v2 as paddle
from model import conv_seq2seq
from beamsearch import BeamSearch
import reader
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Convolutional Seq2Seq")
parser.add_argument(
'--infer_data_path',
type=str,
required=True,
help="Path of the dataset for inference")
parser.add_argument(
'--src_dict_path',
type=str,
required=True,
help='Path of the source dictionary')
parser.add_argument(
'--trg_dict_path',
type=str,
required=True,
help='path of the target dictionary')
parser.add_argument(
'--enc_blocks', type=str, help='Convolution blocks of the encoder')
parser.add_argument(
'--dec_blocks', type=str, help='Convolution blocks of the decoder')
parser.add_argument(
'--emb_size',
type=int,
default=512,
help='Dimension of word embedding. (default: %(default)s)')
parser.add_argument(
'--pos_size',
type=int,
default=200,
help='Total number of the position indexes. (default: %(default)s)')
parser.add_argument(
'--drop_rate',
type=float,
default=0.,
help='Dropout rate. (default: %(default)s)')
parser.add_argument(
"--use_gpu",
default=False,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
'--max_len',
type=int,
default=100,
help="The maximum length of the sentence to be generated. (default: %(default)s)"
)
parser.add_argument(
"--beam_size",
default=1,
type=int,
help="The width of beam expasion. (default: %(default)s)")
parser.add_argument(
"--model_path",
type=str,
required=True,
help="The path of trained model. (default: %(default)s)")
return parser.parse_args()
def to_sentence(seq, dictionary):
raw_sentence = [dictionary[id] for id in seq]
sentence = " ".join(raw_sentence)
return sentence
def infer(infer_data_path,
src_dict_path,
trg_dict_path,
model_path,
enc_conv_blocks,
dec_conv_blocks,
emb_dim=512,
pos_size=200,
drop_rate=0.,
max_len=100,
beam_size=1):
"""
Inference.
:param infer_data_path: The path of the data for inference.
:type infer_data_path: str
:param src_dict_path: The path of the source dictionary.
:type src_dict_path: str
:param trg_dict_path: The path of the target dictionary.
:type trg_dict_path: str
:param model_path: The path of a trained model.
:type model_path: str
:param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type enc_conv_blocks: list of tuple
:param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type dec_conv_blocks: list of tuple
:param emb_dim: The dimension of the embedding vector.
:type emb_dim: int
:param pos_size: The total number of the position indexes, which means
the maximum value of the index is pos_size - 1.
:type pos_size: int
:param drop_rate: Dropout rate.
:type drop_rate: float
:param max_len: The maximum length of the sentence to be generated.
:type max_len: int
:param beam_size: The width of beam expansion.
:type beam_size: int
"""
# load dict
src_dict = reader.load_dict(src_dict_path)
trg_dict = reader.load_dict(trg_dict_path)
src_dict_size = src_dict.__len__()
trg_dict_size = trg_dict.__len__()
prob = conv_seq2seq(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
pos_size=pos_size,
emb_dim=emb_dim,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
drop_rate=drop_rate,
is_infer=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
padding_num = reduce(lambda x, y: x + y, padding_list)
infer_reader = reader.data_reader(
data_file=infer_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num)
inferer = paddle.inference.Inference(
output_layer=prob, parameters=parameters)
searcher = BeamSearch(
inferer=inferer,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num,
max_len=max_len,
beam_size=beam_size)
reverse_trg_dict = reader.get_reverse_dict(trg_dict)
for i, raw_data in enumerate(infer_reader()):
infer_data = [raw_data[0], raw_data[1]]
result = searcher.search_one_sample(infer_data)
sentence = to_sentence(result, reverse_trg_dict)
print sentence
sys.stdout.flush()
return
def main():
args = parse_args()
enc_conv_blocks = eval(args.enc_blocks)
dec_conv_blocks = eval(args.dec_blocks)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
infer(
infer_data_path=args.infer_data_path,
src_dict_path=args.src_dict_path,
trg_dict_path=args.trg_dict_path,
model_path=args.model_path,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
emb_dim=args.emb_size,
pos_size=args.pos_size,
drop_rate=args.drop_rate,
max_len=args.max_len,
beam_size=args.beam_size)
if __name__ == '__main__':
main()
#coding=utf-8
import math
import paddle.v2 as paddle
__all__ = ["conv_seq2seq"]
def gated_conv_with_batchnorm(input,
size,
context_len,
context_start=None,
learning_rate=1.0,
drop_rate=0.):
"""
Definition of the convolution block.
:param input: The input of this block.
:type input: LayerOutput
:param size: The dimension of the block's output.
:type size: int
:param context_len: The context length of the convolution.
:type context_len: int
:param context_start: The start position of the context.
:type context_start: int
:param learning_rate: The learning rate factor of the parameters in the block.
The actual learning rate is the product of the global
learning rate and this factor.
:type learning_rate: float
:param drop_rate: Dropout rate.
:type drop_rate: float
:return: The output of the convolution block.
:rtype: LayerOutput
"""
input = paddle.layer.dropout(input=input, dropout_rate=drop_rate)
context = paddle.layer.mixed(
size=input.size * context_len,
input=paddle.layer.context_projection(
input=input, context_len=context_len, context_start=context_start))
raw_conv = paddle.layer.fc(
input=context,
size=size * 2,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size),
learning_rate=learning_rate),
bias_attr=False)
batch_norm_conv = paddle.layer.batch_norm(
input=raw_conv,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(learning_rate=learning_rate))
with paddle.layer.mixed(size=size) as conv:
conv += paddle.layer.identity_projection(
batch_norm_conv, size=size, offset=0)
with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate:
gate += paddle.layer.identity_projection(
batch_norm_conv, size=size, offset=size)
with paddle.layer.mixed(size=size) as gated_conv:
gated_conv += paddle.layer.dotmul_operator(conv, gate)
return gated_conv
def encoder(token_emb,
pos_emb,
conv_blocks=[(256, 3)] * 5,
num_attention=3,
drop_rate=0.1):
"""
Definition of the encoder.
:param token_emb: The embedding vector of the input token.
:type token_emb: LayerOutput
:param pos_emb: The embedding vector of the input token's position.
:type pos_emb: LayerOutput
:param conv_blocks: The scale list of the convolution blocks. Each element of
the list contains output dimension and context length of
the corresponding convolution block.
:type conv_blocks: list of tuple
:param num_attention: The total number of the attention modules used in the decoder.
:type num_attention: int
:param drop_rate: Dropout rate.
:type drop_rate: float
:return: The input token encoding.
:rtype: LayerOutput
"""
embedding = paddle.layer.addto(
input=[token_emb, pos_emb],
layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
proj_size = conv_blocks[0][0]
block_input = paddle.layer.fc(
input=embedding,
size=proj_size,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt((1.0 - drop_rate) / embedding.size),
learning_rate=1.0 / (2.0 * num_attention)),
bias_attr=True, )
for (size, context_len) in conv_blocks:
if block_input.size == size:
residual = block_input
else:
residual = paddle.layer.fc(
input=block_input,
size=size,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(learning_rate=1.0 /
(2.0 * num_attention)),
bias_attr=True)
gated_conv = gated_conv_with_batchnorm(
input=block_input,
size=size,
context_len=context_len,
learning_rate=1.0 / (2.0 * num_attention),
drop_rate=drop_rate)
with paddle.layer.mixed(size=size) as block_output:
block_output += paddle.layer.identity_projection(residual)
block_output += paddle.layer.identity_projection(gated_conv)
# halve the variance of the sum
block_output = paddle.layer.slope_intercept(
input=block_output, slope=math.sqrt(0.5))
block_input = block_output
emb_dim = embedding.size
encoded_vec = paddle.layer.fc(
input=block_output,
size=emb_dim,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
bias_attr=True)
encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])
# halve the variance of the sum
encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))
return encoded_vec, encoded_sum
def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
"""
Definition of the attention.
:param decoder_state: The hidden state of the decoder.
:type decoder_state: LayerOutput
:param cur_embedding: The embedding vector of the current token.
:type cur_embedding: LayerOutput
:param encoded_vec: The source token encoding.
:type encoded_vec: LayerOutput
:param encoded_sum: The sum of the source token's encoding and embedding.
:type encoded_sum: LayerOutput
:return: A context vector.
:rtype: LayerOutput
"""
residual = decoder_state
state_size = decoder_state.size
emb_dim = cur_embedding.size
with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary:
state_summary += paddle.layer.full_matrix_projection(decoder_state)
state_summary += paddle.layer.identity_projection(cur_embedding)
# halve the variance of the sum
state_summary = paddle.layer.slope_intercept(
input=state_summary, slope=math.sqrt(0.5))
expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec)
m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec)
attention_weight = paddle.layer.fc(
input=m,
size=1,
act=paddle.activation.SequenceSoftmax(),
bias_attr=False)
scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
attended = paddle.layer.pooling(
input=scaled, pooling_type=paddle.pooling.Sum())
attended_proj = paddle.layer.fc(
input=attended,
size=state_size,
act=paddle.activation.Linear(),
bias_attr=True)
attention_result = paddle.layer.addto(input=[attended_proj, residual])
# halve the variance of the sum
attention_result = paddle.layer.slope_intercept(
input=attention_result, slope=math.sqrt(0.5))
return attention_result
def decoder(token_emb,
pos_emb,
encoded_vec,
encoded_sum,
dict_size,
conv_blocks=[(256, 3)] * 3,
drop_rate=0.1):
"""
Definition of the decoder.
:param token_emb: The embedding vector of the input token.
:type token_emb: LayerOutput
:param pos_emb: The embedding vector of the input token's position.
:type pos_emb: LayerOutput
:param encoded_vec: The source token encoding.
:type encoded_vec: LayerOutput
:param encoded_sum: The sum of the source token's encoding and embedding.
:type encoded_sum: LayerOutput
:param dict_size: The size of the target dictionary.
:type dict_size: int
:param conv_blocks: The scale list of the convolution blocks. Each element
of the list contains output dimension and context length
of the corresponding convolution block.
:type conv_blocks: list of tuple
:param drop_rate: Dropout rate.
:type drop_rate: float
:return: The probability of the predicted token.
:rtype: LayerOutput
"""
def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
conditional = attention(
decoder_state=decoder_state,
cur_embedding=cur_embedding,
encoded_vec=encoded_vec,
encoded_sum=encoded_sum)
return conditional
embedding = paddle.layer.addto(
input=[token_emb, pos_emb],
layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
proj_size = conv_blocks[0][0]
block_input = paddle.layer.fc(
input=embedding,
size=proj_size,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)),
bias_attr=True, )
for (size, context_len) in conv_blocks:
if block_input.size == size:
residual = block_input
else:
residual = paddle.layer.fc(
input=block_input,
size=size,
act=paddle.activation.Linear(),
bias_attr=True)
decoder_state = gated_conv_with_batchnorm(
input=block_input,
size=size,
context_len=context_len,
context_start=0,
drop_rate=drop_rate)
group_inputs = [
decoder_state,
embedding,
paddle.layer.StaticInput(input=encoded_vec),
paddle.layer.StaticInput(input=encoded_sum),
]
conditional = paddle.layer.recurrent_group(
step=attention_step, input=group_inputs)
block_output = paddle.layer.addto(input=[conditional, residual])
# halve the variance of the sum
block_output = paddle.layer.slope_intercept(
input=block_output, slope=math.sqrt(0.5))
block_input = block_output
out_emb_dim = embedding.size
block_output = paddle.layer.fc(
input=block_output,
size=out_emb_dim,
act=paddle.activation.Linear(),
layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
decoder_out = paddle.layer.fc(
input=block_output,
size=dict_size,
act=paddle.activation.Softmax(),
param_attr=paddle.attr.Param(
initial_mean=0.,
initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
bias_attr=True)
return decoder_out
def conv_seq2seq(src_dict_size,
trg_dict_size,
pos_size,
emb_dim,
enc_conv_blocks=[(256, 3)] * 5,
dec_conv_blocks=[(256, 3)] * 3,
drop_rate=0.1,
is_infer=False):
"""
Definition of convolutional sequence-to-sequence network.
:param src_dict_size: The size of the source dictionary.
:type src_dict_size: int
:param trg_dict_size: The size of the target dictionary.
:type trg_dict_size: int
:param pos_size: The total number of the position indexes, which means
the maximum value of the index is pos_size - 1.
:type pos_size: int
:param emb_dim: The dimension of the embedding vector.
:type emb_dim: int
:param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
of the list contains output dimension and context length of the
corresponding convolution block.
:type enc_conv_blocks: list of tuple
:param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
of the list contains output dimension and context length of the
corresponding convolution block.
:type dec_conv_blocks: list of tuple
:param drop_rate: Dropout rate.
:type drop_rate: float
:param is_infer: Whether infer or not.
:type is_infer: bool
:return: Cost or output layer.
:rtype: LayerOutput
"""
src = paddle.layer.data(
name='src_word',
type=paddle.data_type.integer_value_sequence(src_dict_size))
src_pos = paddle.layer.data(
name='src_word_pos',
type=paddle.data_type.integer_value_sequence(pos_size +
1)) # one for padding
src_emb = paddle.layer.embedding(
input=src,
size=emb_dim,
name='src_word_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
src_pos_emb = paddle.layer.embedding(
input=src_pos,
size=emb_dim,
name='src_pos_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
num_attention = len(dec_conv_blocks)
encoded_vec, encoded_sum = encoder(
token_emb=src_emb,
pos_emb=src_pos_emb,
conv_blocks=enc_conv_blocks,
num_attention=num_attention,
drop_rate=drop_rate)
trg = paddle.layer.data(
name='trg_word',
type=paddle.data_type.integer_value_sequence(trg_dict_size +
1)) # one for padding
trg_pos = paddle.layer.data(
name='trg_word_pos',
type=paddle.data_type.integer_value_sequence(pos_size +
1)) # one for padding
trg_emb = paddle.layer.embedding(
input=trg,
size=emb_dim,
name='trg_word_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
trg_pos_emb = paddle.layer.embedding(
input=trg_pos,
size=emb_dim,
name='trg_pos_emb',
param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
decoder_out = decoder(
token_emb=trg_emb,
pos_emb=trg_pos_emb,
encoded_vec=encoded_vec,
encoded_sum=encoded_sum,
dict_size=trg_dict_size,
conv_blocks=dec_conv_blocks,
drop_rate=drop_rate)
if is_infer:
return decoder_out
trg_next_word = paddle.layer.data(
name='trg_next_word',
type=paddle.data_type.integer_value_sequence(trg_dict_size))
cost = paddle.layer.classification_cost(
input=decoder_out, label=trg_next_word)
return cost
#coding=utf-8
import random
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
return word_dict
def get_reverse_dict(dictionary):
reverse_dict = {dictionary[k]: k for k in dictionary.keys()}
return reverse_dict
def load_data(data_file, src_dict, trg_dict):
UNK_IDX = src_dict['<unk>']
with open(data_file, 'r') as f:
for line in f:
line_split = line.strip().split('\t')
if len(line_split) < 2:
continue
src, trg = line_split
src_words = src.strip().split()
trg_words = trg.strip().split()
src_seq = [src_dict.get(w, UNK_IDX) for w in src_words]
trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words]
yield src_seq, trg_seq
def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
def reader():
UNK_IDX = src_dict['<unk>']
word_padding = trg_dict.__len__()
pos_padding = pos_size
def _get_pos(pos_list, pos_size, pos_padding):
return [pos if pos < pos_size else pos_padding for pos in pos_list]
with open(data_file, 'r') as f:
for line in f:
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src, trg = line_split
src = src.strip().split()
src_word = [src_dict.get(w, UNK_IDX) for w in src]
src_word_pos = range(len(src_word))
src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding)
trg = trg.strip().split()
trg_word = [trg_dict['<s>']
] + [trg_dict.get(w, UNK_IDX) for w in trg]
trg_word_pos = range(len(trg_word))
trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding)
trg_next_word = trg_word[1:] + [trg_dict['<e>']]
trg_word = [word_padding] * padding_num + trg_word
trg_word_pos = [pos_padding] * padding_num + trg_word_pos
trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num
yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word
return reader
#coding=utf-8
import os
import sys
import time
import argparse
import distutils.util
import gzip
import numpy as np
import paddle.v2 as paddle
from model import conv_seq2seq
import reader
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Convolutional Seq2Seq")
parser.add_argument(
'--train_data_path',
type=str,
required=True,
help="Path of the training set")
parser.add_argument(
'--test_data_path', type=str, help='Path of the test set')
parser.add_argument(
'--src_dict_path',
type=str,
required=True,
help='Path of source dictionary')
parser.add_argument(
'--trg_dict_path',
type=str,
required=True,
help='Path of target dictionary')
parser.add_argument(
'--enc_blocks', type=str, help='Convolution blocks of the encoder')
parser.add_argument(
'--dec_blocks', type=str, help='Convolution blocks of the decoder')
parser.add_argument(
'--emb_size',
type=int,
default=512,
help='Dimension of word embedding. (default: %(default)s)')
parser.add_argument(
'--pos_size',
type=int,
default=200,
help='Total number of the position indexes. (default: %(default)s)')
parser.add_argument(
'--drop_rate',
type=float,
default=0.,
help='Dropout rate. (default: %(default)s)')
parser.add_argument(
"--use_gpu",
default=False,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help="Size of a mini-batch. (default: %(default)s)")
parser.add_argument(
'--num_passes',
type=int,
default=15,
help="Number of passes to train. (default: %(default)s)")
return parser.parse_args()
def create_reader(padding_num,
train_data_path,
test_data_path=None,
src_dict=None,
trg_dict=None,
pos_size=200,
batch_size=32):
train_reader = paddle.batch(
reader=paddle.reader.shuffle(
reader=reader.data_reader(
data_file=train_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num),
buf_size=10240),
batch_size=batch_size)
test_reader = None
if test_data_path:
test_reader = paddle.batch(
reader=paddle.reader.shuffle(
reader=reader.data_reader(
data_file=test_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
padding_num=padding_num),
buf_size=10240),
batch_size=batch_size)
return train_reader, test_reader
def train(train_data_path,
test_data_path,
src_dict_path,
trg_dict_path,
enc_conv_blocks,
dec_conv_blocks,
emb_dim=512,
pos_size=200,
drop_rate=0.,
batch_size=32,
num_passes=15):
"""
Train the convolution sequence-to-sequence model.
:param train_data_path: The path of the training set.
:type train_data_path: str
:param test_data_path: The path of the test set.
:type test_data_path: str
:param src_dict_path: The path of the source dictionary.
:type src_dict_path: str
:param trg_dict_path: The path of the target dictionary.
:type trg_dict_path: str
:param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type enc_conv_blocks: list of tuple
:param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
the list contains output dimension and context length of the corresponding
convolution block.
:type dec_conv_blocks: list of tuple
:param emb_dim: The dimension of the embedding vector.
:type emb_dim: int
:param pos_size: The total number of the position indexes, which means
the maximum value of the index is pos_size - 1.
:type pos_size: int
:param drop_rate: Dropout rate.
:type drop_rate: float
:param batch_size: The size of a mini-batch.
:type batch_size: int
:param num_passes: The total number of the passes to train.
:type num_passes: int
"""
# load dict
src_dict = reader.load_dict(src_dict_path)
trg_dict = reader.load_dict(trg_dict_path)
src_dict_size = src_dict.__len__()
trg_dict_size = trg_dict.__len__()
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3, )
cost = conv_seq2seq(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
pos_size=pos_size,
emb_dim=emb_dim,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
drop_rate=drop_rate,
is_infer=False)
# create parameters and trainer
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
padding_num = reduce(lambda x, y: x + y, padding_list)
train_reader, test_reader = create_reader(
padding_num=padding_num,
train_data_path=train_data_path,
test_data_path=test_data_path,
src_dict=src_dict,
trg_dict=trg_dict,
pos_size=pos_size,
batch_size=batch_size)
feeding = {
'src_word': 0,
'src_word_pos': 1,
'trg_word': 2,
'trg_word_pos': 3,
'trg_next_word': 4
}
# create event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 20 == 0:
cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
cur_time, event.pass_id, event.batch_id, event.cost,
event.metrics)
else:
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
if test_reader is not None:
cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
result = trainer.test(reader=test_reader, feeding=feeding)
print "[%s]: Pass: %d, TestCost: %f, %s" % (
cur_time, event.pass_id, result.cost, result.metrics)
sys.stdout.flush()
with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
trainer.save_parameter_to_tar(f)
if not os.path.exists('output'):
os.mkdir('output')
trainer.train(
reader=train_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
def main():
args = parse_args()
enc_conv_blocks = eval(args.enc_blocks)
dec_conv_blocks = eval(args.dec_blocks)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train(
train_data_path=args.train_data_path,
test_data_path=args.test_data_path,
src_dict_path=args.src_dict_path,
trg_dict_path=args.trg_dict_path,
enc_conv_blocks=enc_conv_blocks,
dec_conv_blocks=dec_conv_blocks,
emb_dim=args.emb_size,
pos_size=args.pos_size,
drop_rate=args.drop_rate,
batch_size=args.batch_size,
num_passes=args.num_passes)
if __name__ == '__main__':
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册