From 29bbd41ab44b0a721b614026f0a6e1f1a21c9bf0 Mon Sep 17 00:00:00 2001
From: ranqiu
Date: Tue, 14 Nov 2017 14:43:12 +0800
Subject: [PATCH] delete redundant dir

---
 conv_seq_to_seq/README.md     |  50 ----
 conv_seq_to_seq/beamsearch.py | 163 -------------
 conv_seq_to_seq/infer.py      | 199 ----------------
 conv_seq_to_seq/model.py      | 417 ----------------------------------
 conv_seq_to_seq/reader.py     |  67 ------
 conv_seq_to_seq/train.py      | 252 --------------------
 6 files changed, 1148 deletions(-)
 delete mode 100644 conv_seq_to_seq/README.md
 delete mode 100644 conv_seq_to_seq/beamsearch.py
 delete mode 100644 conv_seq_to_seq/infer.py
 delete mode 100644 conv_seq_to_seq/model.py
 delete mode 100644 conv_seq_to_seq/reader.py
 delete mode 100644 conv_seq_to_seq/train.py

diff --git a/conv_seq_to_seq/README.md b/conv_seq_to_seq/README.md
deleted file mode 100644
index 817c464a..00000000
--- a/conv_seq_to_seq/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# Convolutional Sequence to Sequence Learning
-This model implements the work in the following paper:
-
-Jonas Gehring, Michael Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017
-
-# Training a Model
-- Modify the following script if needed and then run:
-
-  ```bash
-  python train.py \
-  --train_data_path ./data/train_data \
-  --test_data_path ./data/test_data \
-  --src_dict_path ./data/src_dict \
-  --trg_dict_path ./data/trg_dict \
-  --enc_blocks "[(256, 3)] * 5" \
-  --dec_blocks "[(256, 3)] * 3" \
-  --emb_size 256 \
-  --pos_size 200 \
-  --drop_rate 0.1 \
-  --use_gpu False \
-  --trainer_count 1 \
-  --batch_size 32 \
-  --num_passes 20 \
-  >train.log 2>&1
-  ```
-
-# Inferring with a Trained Model
-- Infer with a trained model by running:
-
-  ```bash
-  python infer.py \
-  --infer_data_path ./data/infer_data \
-  --src_dict_path ./data/src_dict \
-  --trg_dict_path ./data/trg_dict \
-  --enc_blocks "[(256, 3)] * 5" \
-  --dec_blocks "[(256, 3)] * 3" \
-  --emb_size 256 \
-  --pos_size 200 \
-  --drop_rate 0.1 \
-  --use_gpu False \
-  --trainer_count 1 \
-  --max_len 100 \
-  --beam_size 1 \
-  --model_path ./params.pass-0.tar.gz \
-  1>infer_result 2>infer.log
-  ```
-
-# Notes
-
-Currently, beam search forwards the encoder multiple times when predicting each target word, which requires extra computation. This will be fixed later.
diff --git a/conv_seq_to_seq/beamsearch.py b/conv_seq_to_seq/beamsearch.py
deleted file mode 100644
index 45656e80..00000000
--- a/conv_seq_to_seq/beamsearch.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#coding=utf-8
-
-import sys
-import time
-import numpy as np
-
-
-class BeamSearch(object):
-    """
-    Generate sequence by beam search
-    NOTE: this class only implements generating one sentence at a time.
-    """
-
-    def __init__(self,
-                 inferer,
-                 trg_dict,
-                 pos_size,
-                 padding_num,
-                 beam_size=1,
-                 max_len=100):
-        self.inferer = inferer
-        self.trg_dict = trg_dict
-        self.word_padding = trg_dict.__len__()
-        self.pos_size = pos_size
-        self.pos_padding = pos_size
-        self.padding_num = padding_num
-        self.win_len = padding_num + 1
-        self.max_len = max_len
-        self.beam_size = beam_size
-
-    def get_beam_input(self, pre_beam_list, infer_data):
-        """
-        Get input for generation at the current iteration.
- """ - beam_input = [] - - if len(pre_beam_list) == 0: - cur_trg = [self.word_padding - ] * self.padding_num + [self.trg_dict['']] - cur_trg_pos = [self.pos_padding] * self.padding_num + [0] - beam_input.append(infer_data + [cur_trg] + [cur_trg_pos]) - else: - for seq in pre_beam_list: - if len(seq) < self.win_len: - cur_trg = [self.word_padding] * ( - self.win_len - len(seq) - 1 - ) + [self.trg_dict['']] + seq - cur_trg_pos = [self.pos_padding] * ( - self.win_len - len(seq) - 1) + [0] + range(1, - len(seq) + 1) - else: - cur_trg = seq[-self.win_len:] - cur_trg_pos = range( - len(seq) + 1 - self.win_len, len(seq) + 1) - - beam_input.append(infer_data + [cur_trg] + [cur_trg_pos]) - return beam_input - - def get_prob(self, beam_input): - """ - Get the probabilities of all possible tokens. - """ - row_list = [j * self.win_len for j in range(len(beam_input))] - prob = self.inferer.infer(beam_input, field='value')[row_list, :] - return prob - - def get_candidate(self, pre_beam_list, pre_beam_score, prob): - """ - Get top beam_size tokens and their scores for each beam. - """ - if prob.ndim == 1: - candidate_id = prob.argsort()[-self.beam_size:][::-1] - candidate_log_prob = np.log(prob[candidate_id]) - else: - candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1] - candidate_log_prob = np.zeros_like(candidate_id).astype('float32') - for j in range(len(pre_beam_list)): - candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]]) - - if pre_beam_score.size > 0: - candidate_score = candidate_log_prob + pre_beam_score.reshape( - (pre_beam_score.size, 1)) - else: - candidate_score = candidate_log_prob - - return candidate_id, candidate_score - - def prune(self, candidate_id, candidate_score, pre_beam_list, - completed_seq_list, completed_seq_score, completed_seq_min_score): - """ - Pruning process of the beam search. During the process, beam_size most possible sequences - are selected for the beam in the next iteration. Besides, their scores and the minimum score - of the completed sequences are updated. - """ - candidate_id = candidate_id.flatten() - candidate_score = candidate_score.flatten() - - topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist() - topk_seq_idx = [idx / self.beam_size for idx in topk_idx] - - next_beam = [] - beam_score = [] - for j in range(len(topk_idx)): - if candidate_id[topk_idx[j]] == self.trg_dict['']: - if len( - completed_seq_list - ) < self.beam_size or completed_seq_min_score <= candidate_score[ - topk_idx[j]]: - completed_seq_list.append(pre_beam_list[topk_seq_idx[j]]) - completed_seq_score.append(candidate_score[topk_idx[j]]) - - if completed_seq_min_score is None or ( - completed_seq_min_score >= - candidate_score[topk_idx[j]] and - len(completed_seq_list) < self.beam_size): - completed_seq_min_score = candidate_score[topk_idx[j]] - else: - seq = pre_beam_list[topk_seq_idx[ - j]] + [candidate_id[topk_idx[j]]] - score = candidate_score[topk_idx[j]] - next_beam.append(seq) - beam_score.append(score) - - beam_score = np.array(beam_score) - return next_beam, beam_score, completed_seq_min_score - - def search_one_sample(self, infer_data): - """ - Beam search process for one sample. 
- """ - completed_seq_list = [] - completed_seq_score = [] - completed_seq_min_score = None - uncompleted_seq_list = [[]] - uncompleted_seq_score = np.zeros(0) - - for i in xrange(self.max_len): - beam_input = self.get_beam_input(uncompleted_seq_list, infer_data) - - prob = self.get_prob(beam_input) - - candidate_id, candidate_score = self.get_candidate( - uncompleted_seq_list, uncompleted_seq_score, prob) - - uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune( - candidate_id, candidate_score, uncompleted_seq_list, - completed_seq_list, completed_seq_score, - completed_seq_min_score) - - if len(uncompleted_seq_list) == 0: - break - if len(completed_seq_list) >= self.beam_size: - seq_max_score = uncompleted_seq_score.max() - if seq_max_score < completed_seq_min_score: - uncompleted_seq_list = [] - break - - final_seq_list = completed_seq_list + uncompleted_seq_list - final_score = np.concatenate( - (np.array(completed_seq_score), uncompleted_seq_score)) - max_id = final_score.argmax() - top_seq = final_seq_list[max_id] - return top_seq diff --git a/conv_seq_to_seq/infer.py b/conv_seq_to_seq/infer.py deleted file mode 100644 index eb46df55..00000000 --- a/conv_seq_to_seq/infer.py +++ /dev/null @@ -1,199 +0,0 @@ -#coding=utf-8 - -import sys -import argparse -import distutils.util -import gzip - -import paddle.v2 as paddle -from model import conv_seq2seq -from beamsearch import BeamSearch -import reader - - -def parse_args(): - parser = argparse.ArgumentParser( - description="PaddlePaddle Convolutional Seq2Seq") - parser.add_argument( - '--infer_data_path', - type=str, - required=True, - help="Path of the dataset for inference") - parser.add_argument( - '--src_dict_path', - type=str, - required=True, - help='Path of the source dictionary') - parser.add_argument( - '--trg_dict_path', - type=str, - required=True, - help='path of the target dictionary') - parser.add_argument( - '--enc_blocks', type=str, help='Convolution blocks of the encoder') - parser.add_argument( - '--dec_blocks', type=str, help='Convolution blocks of the decoder') - parser.add_argument( - '--emb_size', - type=int, - default=512, - help='Dimension of word embedding. (default: %(default)s)') - parser.add_argument( - '--pos_size', - type=int, - default=200, - help='Total number of the position indexes. (default: %(default)s)') - parser.add_argument( - '--drop_rate', - type=float, - default=0., - help='Dropout rate. (default: %(default)s)') - parser.add_argument( - "--use_gpu", - default=False, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") - parser.add_argument( - "--trainer_count", - default=1, - type=int, - help="Trainer number. (default: %(default)s)") - parser.add_argument( - '--max_len', - type=int, - default=100, - help="The maximum length of the sentence to be generated. (default: %(default)s)" - ) - parser.add_argument( - "--beam_size", - default=1, - type=int, - help="The width of beam expasion. (default: %(default)s)") - parser.add_argument( - "--model_path", - type=str, - required=True, - help="The path of trained model. (default: %(default)s)") - return parser.parse_args() - - -def to_sentence(seq, dictionary): - raw_sentence = [dictionary[id] for id in seq] - sentence = " ".join(raw_sentence) - return sentence - - -def infer(infer_data_path, - src_dict_path, - trg_dict_path, - model_path, - enc_conv_blocks, - dec_conv_blocks, - emb_dim=512, - pos_size=200, - drop_rate=0., - max_len=100, - beam_size=1): - """ - Inference. 
-
-    :param infer_data_path: The path of the data for inference.
-    :type infer_data_path: str
-    :param src_dict_path: The path of the source dictionary.
-    :type src_dict_path: str
-    :param trg_dict_path: The path of the target dictionary.
-    :type trg_dict_path: str
-    :param model_path: The path of a trained model.
-    :type model_path: str
-    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
-    :type enc_conv_blocks: list of tuple
-    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
-    :type dec_conv_blocks: list of tuple
-    :param emb_dim: The dimension of the embedding vector.
-    :type emb_dim: int
-    :param pos_size: The total number of the position indexes, which means
-                     the maximum value of the index is pos_size - 1.
-    :type pos_size: int
-    :param drop_rate: Dropout rate.
-    :type drop_rate: float
-    :param max_len: The maximum length of the sentence to be generated.
-    :type max_len: int
-    :param beam_size: The width of beam expansion.
-    :type beam_size: int
-    """
-    # load dict
-    src_dict = reader.load_dict(src_dict_path)
-    trg_dict = reader.load_dict(trg_dict_path)
-    src_dict_size = src_dict.__len__()
-    trg_dict_size = trg_dict.__len__()
-
-    prob = conv_seq2seq(
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        pos_size=pos_size,
-        emb_dim=emb_dim,
-        enc_conv_blocks=enc_conv_blocks,
-        dec_conv_blocks=dec_conv_blocks,
-        drop_rate=drop_rate,
-        is_infer=True)
-
-    # load parameters
-    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
-
-    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
-    padding_num = reduce(lambda x, y: x + y, padding_list)
-    infer_reader = reader.data_reader(
-        data_file=infer_data_path,
-        src_dict=src_dict,
-        trg_dict=trg_dict,
-        pos_size=pos_size,
-        padding_num=padding_num)
-
-    inferer = paddle.inference.Inference(
-        output_layer=prob, parameters=parameters)
-
-    searcher = BeamSearch(
-        inferer=inferer,
-        trg_dict=trg_dict,
-        pos_size=pos_size,
-        padding_num=padding_num,
-        max_len=max_len,
-        beam_size=beam_size)
-
-    reverse_trg_dict = reader.get_reverse_dict(trg_dict)
-    for i, raw_data in enumerate(infer_reader()):
-        infer_data = [raw_data[0], raw_data[1]]
-        result = searcher.search_one_sample(infer_data)
-        sentence = to_sentence(result, reverse_trg_dict)
-        print sentence
-        sys.stdout.flush()
-    return
-
-
-def main():
-    args = parse_args()
-    enc_conv_blocks = eval(args.enc_blocks)
-    dec_conv_blocks = eval(args.dec_blocks)
-
-    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
-
-    infer(
-        infer_data_path=args.infer_data_path,
-        src_dict_path=args.src_dict_path,
-        trg_dict_path=args.trg_dict_path,
-        model_path=args.model_path,
-        enc_conv_blocks=enc_conv_blocks,
-        dec_conv_blocks=dec_conv_blocks,
-        emb_dim=args.emb_size,
-        pos_size=args.pos_size,
-        drop_rate=args.drop_rate,
-        max_len=args.max_len,
-        beam_size=args.beam_size)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/conv_seq_to_seq/model.py b/conv_seq_to_seq/model.py
deleted file mode 100644
index 01dd9428..00000000
--- a/conv_seq_to_seq/model.py
+++ /dev/null
@@ -1,417 +0,0 @@
-#coding=utf-8
-
-import math
-
-import paddle.v2 as paddle
-
-__all__ = ["conv_seq2seq"]
-
-
-def gated_conv_with_batchnorm(input,
-                              size,
-                              context_len,
-                              context_start=None,
-                              learning_rate=1.0,
-                              drop_rate=0.):
-    """
-    Definition of the convolution block.
-
-    :param input: The input of this block.
-    :type input: LayerOutput
-    :param size: The dimension of the block's output.
-    :type size: int
-    :param context_len: The context length of the convolution.
-    :type context_len: int
-    :param context_start: The start position of the context.
-    :type context_start: int
-    :param learning_rate: The learning rate factor of the parameters in the block.
-                          The actual learning rate is the product of the global
-                          learning rate and this factor.
-    :type learning_rate: float
-    :param drop_rate: Dropout rate.
-    :type drop_rate: float
-    :return: The output of the convolution block.
-    :rtype: LayerOutput
-    """
-    input = paddle.layer.dropout(input=input, dropout_rate=drop_rate)
-
-    context = paddle.layer.mixed(
-        size=input.size * context_len,
-        input=paddle.layer.context_projection(
-            input=input, context_len=context_len, context_start=context_start))
-
-    raw_conv = paddle.layer.fc(
-        input=context,
-        size=size * 2,
-        act=paddle.activation.Linear(),
-        param_attr=paddle.attr.Param(
-            initial_mean=0.,
-            initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size),
-            learning_rate=learning_rate),
-        bias_attr=False)
-
-    batch_norm_conv = paddle.layer.batch_norm(
-        input=raw_conv,
-        act=paddle.activation.Linear(),
-        param_attr=paddle.attr.Param(learning_rate=learning_rate))
-
-    with paddle.layer.mixed(size=size) as conv:
-        conv += paddle.layer.identity_projection(
-            batch_norm_conv, size=size, offset=0)
-
-    with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate:
-        gate += paddle.layer.identity_projection(
-            batch_norm_conv, size=size, offset=size)
-
-    with paddle.layer.mixed(size=size) as gated_conv:
-        gated_conv += paddle.layer.dotmul_operator(conv, gate)
-
-    return gated_conv
-
-
-def encoder(token_emb,
-            pos_emb,
-            conv_blocks=[(256, 3)] * 5,
-            num_attention=3,
-            drop_rate=0.1):
-    """
-    Definition of the encoder.
-
-    :param token_emb: The embedding vector of the input token.
-    :type token_emb: LayerOutput
-    :param pos_emb: The embedding vector of the input token's position.
-    :type pos_emb: LayerOutput
-    :param conv_blocks: The scale list of the convolution blocks. Each element of
-                        the list contains output dimension and context length of
-                        the corresponding convolution block.
-    :type conv_blocks: list of tuple
-    :param num_attention: The total number of the attention modules used in the decoder.
-    :type num_attention: int
-    :param drop_rate: Dropout rate.
-    :type drop_rate: float
-    :return: The input token encoding.
-    :rtype: LayerOutput
-    """
-    embedding = paddle.layer.addto(
-        input=[token_emb, pos_emb],
-        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
-
-    proj_size = conv_blocks[0][0]
-    block_input = paddle.layer.fc(
-        input=embedding,
-        size=proj_size,
-        act=paddle.activation.Linear(),
-        param_attr=paddle.attr.Param(
-            initial_mean=0.,
-            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size),
-            learning_rate=1.0 / (2.0 * num_attention)),
-        bias_attr=True, )
-
-    for (size, context_len) in conv_blocks:
-        if block_input.size == size:
-            residual = block_input
-        else:
-            residual = paddle.layer.fc(
-                input=block_input,
-                size=size,
-                act=paddle.activation.Linear(),
-                param_attr=paddle.attr.Param(learning_rate=1.0 /
-                                             (2.0 * num_attention)),
-                bias_attr=True)
-
-        gated_conv = gated_conv_with_batchnorm(
-            input=block_input,
-            size=size,
-            context_len=context_len,
-            learning_rate=1.0 / (2.0 * num_attention),
-            drop_rate=drop_rate)
-
-        with paddle.layer.mixed(size=size) as block_output:
-            block_output += paddle.layer.identity_projection(residual)
-            block_output += paddle.layer.identity_projection(gated_conv)
-
-        # halve the variance of the sum
-        block_output = paddle.layer.slope_intercept(
-            input=block_output, slope=math.sqrt(0.5))
-
-        block_input = block_output
-
-    emb_dim = embedding.size
-    encoded_vec = paddle.layer.fc(
-        input=block_output,
-        size=emb_dim,
-        act=paddle.activation.Linear(),
-        param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
-        bias_attr=True)
-
-    encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])
-
-    # halve the variance of the sum
-    encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))
-
-    return encoded_vec, encoded_sum
-
-
-def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
-    """
-    Definition of the attention.
-
-    :param decoder_state: The hidden state of the decoder.
-    :type decoder_state: LayerOutput
-    :param cur_embedding: The embedding vector of the current token.
-    :type cur_embedding: LayerOutput
-    :param encoded_vec: The source token encoding.
-    :type encoded_vec: LayerOutput
-    :param encoded_sum: The sum of the source token's encoding and embedding.
-    :type encoded_sum: LayerOutput
-    :return: A context vector.
-    :rtype: LayerOutput
-    """
-    residual = decoder_state
-
-    state_size = decoder_state.size
-    emb_dim = cur_embedding.size
-    with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary:
-        state_summary += paddle.layer.full_matrix_projection(decoder_state)
-        state_summary += paddle.layer.identity_projection(cur_embedding)
-
-    # halve the variance of the sum
-    state_summary = paddle.layer.slope_intercept(
-        input=state_summary, slope=math.sqrt(0.5))
-
-    expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec)
-
-    m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec)
-
-    attention_weight = paddle.layer.fc(
-        input=m,
-        size=1,
-        act=paddle.activation.SequenceSoftmax(),
-        bias_attr=False)
-
-    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
-
-    attended = paddle.layer.pooling(
-        input=scaled, pooling_type=paddle.pooling.Sum())
-
-    attended_proj = paddle.layer.fc(
-        input=attended,
-        size=state_size,
-        act=paddle.activation.Linear(),
-        bias_attr=True)
-
-    attention_result = paddle.layer.addto(input=[attended_proj, residual])
-
-    # halve the variance of the sum
-    attention_result = paddle.layer.slope_intercept(
-        input=attention_result, slope=math.sqrt(0.5))
-    return attention_result
-
-
-def decoder(token_emb,
-            pos_emb,
-            encoded_vec,
-            encoded_sum,
-            dict_size,
-            conv_blocks=[(256, 3)] * 3,
-            drop_rate=0.1):
-    """
-    Definition of the decoder.
-
-    :param token_emb: The embedding vector of the input token.
-    :type token_emb: LayerOutput
-    :param pos_emb: The embedding vector of the input token's position.
-    :type pos_emb: LayerOutput
-    :param encoded_vec: The source token encoding.
-    :type encoded_vec: LayerOutput
-    :param encoded_sum: The sum of the source token's encoding and embedding.
-    :type encoded_sum: LayerOutput
-    :param dict_size: The size of the target dictionary.
-    :type dict_size: int
-    :param conv_blocks: The scale list of the convolution blocks. Each element
-                        of the list contains output dimension and context length
-                        of the corresponding convolution block.
-    :type conv_blocks: list of tuple
-    :param drop_rate: Dropout rate.
-    :type drop_rate: float
-    :return: The probability of the predicted token.
-    :rtype: LayerOutput
-    """
-
-    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
-        conditional = attention(
-            decoder_state=decoder_state,
-            cur_embedding=cur_embedding,
-            encoded_vec=encoded_vec,
-            encoded_sum=encoded_sum)
-        return conditional
-
-    embedding = paddle.layer.addto(
-        input=[token_emb, pos_emb],
-        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
-
-    proj_size = conv_blocks[0][0]
-    block_input = paddle.layer.fc(
-        input=embedding,
-        size=proj_size,
-        act=paddle.activation.Linear(),
-        param_attr=paddle.attr.Param(
-            initial_mean=0.,
-            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)),
-        bias_attr=True, )
-
-    for (size, context_len) in conv_blocks:
-        if block_input.size == size:
-            residual = block_input
-        else:
-            residual = paddle.layer.fc(
-                input=block_input,
-                size=size,
-                act=paddle.activation.Linear(),
-                bias_attr=True)
-
-        decoder_state = gated_conv_with_batchnorm(
-            input=block_input,
-            size=size,
-            context_len=context_len,
-            context_start=0,
-            drop_rate=drop_rate)
-
-        group_inputs = [
-            decoder_state,
-            embedding,
-            paddle.layer.StaticInput(input=encoded_vec),
-            paddle.layer.StaticInput(input=encoded_sum),
-        ]
-
-        conditional = paddle.layer.recurrent_group(
-            step=attention_step, input=group_inputs)
-
-        block_output = paddle.layer.addto(input=[conditional, residual])
-
-        # halve the variance of the sum
-        block_output = paddle.layer.slope_intercept(
-            input=block_output, slope=math.sqrt(0.5))
-
-        block_input = block_output
-
-    out_emb_dim = embedding.size
-    block_output = paddle.layer.fc(
-        input=block_output,
-        size=out_emb_dim,
-        act=paddle.activation.Linear(),
-        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
-
-    decoder_out = paddle.layer.fc(
-        input=block_output,
-        size=dict_size,
-        act=paddle.activation.Softmax(),
-        param_attr=paddle.attr.Param(
-            initial_mean=0.,
-            initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
-        bias_attr=True)
-
-    return decoder_out
-
-
-def conv_seq2seq(src_dict_size,
-                 trg_dict_size,
-                 pos_size,
-                 emb_dim,
-                 enc_conv_blocks=[(256, 3)] * 5,
-                 dec_conv_blocks=[(256, 3)] * 3,
-                 drop_rate=0.1,
-                 is_infer=False):
-    """
-    Definition of convolutional sequence-to-sequence network.
-
-    :param src_dict_size: The size of the source dictionary.
-    :type src_dict_size: int
-    :param trg_dict_size: The size of the target dictionary.
-    :type trg_dict_size: int
-    :param pos_size: The total number of the position indexes, which means
-                     the maximum value of the index is pos_size - 1.
-    :type pos_size: int
-    :param emb_dim: The dimension of the embedding vector.
-    :type emb_dim: int
-    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
-                            of the list contains output dimension and context length of the
-                            corresponding convolution block.
-    :type enc_conv_blocks: list of tuple
-    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
-                            of the list contains output dimension and context length of the
-                            corresponding convolution block.
-    :type dec_conv_blocks: list of tuple
-    :param drop_rate: Dropout rate.
-    :type drop_rate: float
-    :param is_infer: Whether infer or not.
-    :type is_infer: bool
-    :return: Cost or output layer.
-    :rtype: LayerOutput
-    """
-    src = paddle.layer.data(
-        name='src_word',
-        type=paddle.data_type.integer_value_sequence(src_dict_size))
-    src_pos = paddle.layer.data(
-        name='src_word_pos',
-        type=paddle.data_type.integer_value_sequence(pos_size +
-                                                     1))  # one for padding
-
-    src_emb = paddle.layer.embedding(
-        input=src,
-        size=emb_dim,
-        name='src_word_emb',
-        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
-    src_pos_emb = paddle.layer.embedding(
-        input=src_pos,
-        size=emb_dim,
-        name='src_pos_emb',
-        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
-
-    num_attention = len(dec_conv_blocks)
-    encoded_vec, encoded_sum = encoder(
-        token_emb=src_emb,
-        pos_emb=src_pos_emb,
-        conv_blocks=enc_conv_blocks,
-        num_attention=num_attention,
-        drop_rate=drop_rate)
-
-    trg = paddle.layer.data(
-        name='trg_word',
-        type=paddle.data_type.integer_value_sequence(trg_dict_size +
-                                                     1))  # one for padding
-    trg_pos = paddle.layer.data(
-        name='trg_word_pos',
-        type=paddle.data_type.integer_value_sequence(pos_size +
-                                                     1))  # one for padding
-
-    trg_emb = paddle.layer.embedding(
-        input=trg,
-        size=emb_dim,
-        name='trg_word_emb',
-        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
-    trg_pos_emb = paddle.layer.embedding(
-        input=trg_pos,
-        size=emb_dim,
-        name='trg_pos_emb',
-        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
-
-    decoder_out = decoder(
-        token_emb=trg_emb,
-        pos_emb=trg_pos_emb,
-        encoded_vec=encoded_vec,
-        encoded_sum=encoded_sum,
-        dict_size=trg_dict_size,
-        conv_blocks=dec_conv_blocks,
-        drop_rate=drop_rate)
-
-    if is_infer:
-        return decoder_out
-
-    trg_next_word = paddle.layer.data(
-        name='trg_next_word',
-        type=paddle.data_type.integer_value_sequence(trg_dict_size))
-    cost = paddle.layer.classification_cost(
-        input=decoder_out, label=trg_next_word)
-
-    return cost
diff --git a/conv_seq_to_seq/reader.py b/conv_seq_to_seq/reader.py
deleted file mode 100644
index 6d4db49f..00000000
--- a/conv_seq_to_seq/reader.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#coding=utf-8
-
-import random
-
-
-def load_dict(dict_file):
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(f):
-            w = line.strip().split()[0]
-            word_dict[w] = i
-    return word_dict
-
-
-def get_reverse_dict(dictionary):
-    reverse_dict = {dictionary[k]: k for k in dictionary.keys()}
-    return reverse_dict
-
-
-def load_data(data_file, src_dict, trg_dict):
-    UNK_IDX = src_dict['<unk>']
-    with open(data_file, 'r') as f:
-        for line in f:
-            line_split = line.strip().split('\t')
-            if len(line_split) < 2:
-                continue
-            src, trg = line_split
-            src_words = src.strip().split()
-            trg_words = trg.strip().split()
-            src_seq = [src_dict.get(w, UNK_IDX) for w in src_words]
-            trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words]
-            yield src_seq, trg_seq
-
-
-def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
-    def reader():
-        UNK_IDX = src_dict['<unk>']
-        word_padding = trg_dict.__len__()
-        pos_padding = pos_size
-
-        def _get_pos(pos_list, pos_size, pos_padding):
-            return [pos if pos < pos_size else pos_padding for pos in pos_list]
-
-        with open(data_file, 'r') as f:
-            for line in f:
-                line_split = line.strip().split('\t')
-                if len(line_split) != 2:
-                    continue
-                src, trg = line_split
-                src = src.strip().split()
-                src_word = [src_dict.get(w, UNK_IDX) for w in src]
-                src_word_pos = range(len(src_word))
-                src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding)
-
-                trg = trg.strip().split()
-                trg_word = [trg_dict['<s>']
-                            ] + [trg_dict.get(w, UNK_IDX) for w in trg]
-                trg_word_pos = range(len(trg_word))
-                trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding)
-
-                trg_next_word = trg_word[1:] + [trg_dict['<e>']]
-                trg_word = [word_padding] * padding_num + trg_word
-                trg_word_pos = [pos_padding] * padding_num + trg_word_pos
-                trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num
-                yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word
-
-    return reader
diff --git a/conv_seq_to_seq/train.py b/conv_seq_to_seq/train.py
deleted file mode 100644
index c6ce0dff..00000000
--- a/conv_seq_to_seq/train.py
+++ /dev/null
@@ -1,252 +0,0 @@
-#coding=utf-8
-
-import os
-import sys
-import time
-import argparse
-import distutils.util
-import gzip
-import numpy as np
-
-import paddle.v2 as paddle
-from model import conv_seq2seq
-import reader
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="PaddlePaddle Convolutional Seq2Seq")
-    parser.add_argument(
-        '--train_data_path',
-        type=str,
-        required=True,
-        help="Path of the training set")
-    parser.add_argument(
-        '--test_data_path', type=str, help='Path of the test set')
-    parser.add_argument(
-        '--src_dict_path',
-        type=str,
-        required=True,
-        help='Path of source dictionary')
-    parser.add_argument(
-        '--trg_dict_path',
-        type=str,
-        required=True,
-        help='Path of target dictionary')
-    parser.add_argument(
-        '--enc_blocks', type=str, help='Convolution blocks of the encoder')
-    parser.add_argument(
-        '--dec_blocks', type=str, help='Convolution blocks of the decoder')
-    parser.add_argument(
-        '--emb_size',
-        type=int,
-        default=512,
-        help='Dimension of word embedding. (default: %(default)s)')
-    parser.add_argument(
-        '--pos_size',
-        type=int,
-        default=200,
-        help='Total number of the position indexes. (default: %(default)s)')
-    parser.add_argument(
-        '--drop_rate',
-        type=float,
-        default=0.,
-        help='Dropout rate. (default: %(default)s)')
-    parser.add_argument(
-        "--use_gpu",
-        default=False,
-        type=distutils.util.strtobool,
-        help="Use gpu or not. (default: %(default)s)")
-    parser.add_argument(
-        "--trainer_count",
-        default=1,
-        type=int,
-        help="Trainer number. (default: %(default)s)")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=32,
-        help="Size of a mini-batch. (default: %(default)s)")
-    parser.add_argument(
-        '--num_passes',
-        type=int,
-        default=15,
-        help="Number of passes to train. (default: %(default)s)")
-    return parser.parse_args()
-
-
-def create_reader(padding_num,
-                  train_data_path,
-                  test_data_path=None,
-                  src_dict=None,
-                  trg_dict=None,
-                  pos_size=200,
-                  batch_size=32):
-
-    train_reader = paddle.batch(
-        reader=paddle.reader.shuffle(
-            reader=reader.data_reader(
-                data_file=train_data_path,
-                src_dict=src_dict,
-                trg_dict=trg_dict,
-                pos_size=pos_size,
-                padding_num=padding_num),
-            buf_size=10240),
-        batch_size=batch_size)
-
-    test_reader = None
-    if test_data_path:
-        test_reader = paddle.batch(
-            reader=paddle.reader.shuffle(
-                reader=reader.data_reader(
-                    data_file=test_data_path,
-                    src_dict=src_dict,
-                    trg_dict=trg_dict,
-                    pos_size=pos_size,
-                    padding_num=padding_num),
-                buf_size=10240),
-            batch_size=batch_size)
-
-    return train_reader, test_reader
-
-
-def train(train_data_path,
-          test_data_path,
-          src_dict_path,
-          trg_dict_path,
-          enc_conv_blocks,
-          dec_conv_blocks,
-          emb_dim=512,
-          pos_size=200,
-          drop_rate=0.,
-          batch_size=32,
-          num_passes=15):
-    """
-    Train the convolutional sequence-to-sequence model.
-
-    :param train_data_path: The path of the training set.
-    :type train_data_path: str
-    :param test_data_path: The path of the test set.
-    :type test_data_path: str
-    :param src_dict_path: The path of the source dictionary.
-    :type src_dict_path: str
-    :param trg_dict_path: The path of the target dictionary.
-    :type trg_dict_path: str
-    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
-    :type enc_conv_blocks: list of tuple
-    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
-    :type dec_conv_blocks: list of tuple
-    :param emb_dim: The dimension of the embedding vector.
-    :type emb_dim: int
-    :param pos_size: The total number of the position indexes, which means
-                     the maximum value of the index is pos_size - 1.
-    :type pos_size: int
-    :param drop_rate: Dropout rate.
-    :type drop_rate: float
-    :param batch_size: The size of a mini-batch.
-    :type batch_size: int
-    :param num_passes: The total number of the passes to train.
-    :type num_passes: int
-    """
-    # load dict
-    src_dict = reader.load_dict(src_dict_path)
-    trg_dict = reader.load_dict(trg_dict_path)
-    src_dict_size = src_dict.__len__()
-    trg_dict_size = trg_dict.__len__()
-
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=1e-3, )
-
-    cost = conv_seq2seq(
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        pos_size=pos_size,
-        emb_dim=emb_dim,
-        enc_conv_blocks=enc_conv_blocks,
-        dec_conv_blocks=dec_conv_blocks,
-        drop_rate=drop_rate,
-        is_infer=False)
-
-    # create parameters and trainer
-    parameters = paddle.parameters.create(cost)
-    trainer = paddle.trainer.SGD(
-        cost=cost, parameters=parameters, update_equation=optimizer)
-
-    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
-    padding_num = reduce(lambda x, y: x + y, padding_list)
-    train_reader, test_reader = create_reader(
-        padding_num=padding_num,
-        train_data_path=train_data_path,
-        test_data_path=test_data_path,
-        src_dict=src_dict,
-        trg_dict=trg_dict,
-        pos_size=pos_size,
-        batch_size=batch_size)
-
-    feeding = {
-        'src_word': 0,
-        'src_word_pos': 1,
-        'trg_word': 2,
-        'trg_word_pos': 3,
-        'trg_next_word': 4
-    }
-
-    # create event handler
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 20 == 0:
-                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
-                print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
-                    cur_time, event.pass_id, event.batch_id, event.cost,
-                    event.metrics)
-            else:
-                sys.stdout.flush()
-
-        if isinstance(event, paddle.event.EndPass):
-            if test_reader is not None:
-                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
-                result = trainer.test(reader=test_reader, feeding=feeding)
-                print "[%s]: Pass: %d, TestCost: %f, %s" % (
-                    cur_time, event.pass_id, result.cost, result.metrics)
-                sys.stdout.flush()
-            with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
-                           'w') as f:
-                trainer.save_parameter_to_tar(f)
-
-    if not os.path.exists('output'):
-        os.mkdir('output')
-
-    trainer.train(
-        reader=train_reader,
-        event_handler=event_handler,
-        num_passes=num_passes,
-        feeding=feeding)
-
-
-def main():
-    args = parse_args()
-    enc_conv_blocks = eval(args.enc_blocks)
-    dec_conv_blocks = eval(args.dec_blocks)
-
-    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
-
-    train(
-        train_data_path=args.train_data_path,
-        test_data_path=args.test_data_path,
-        src_dict_path=args.src_dict_path,
-        trg_dict_path=args.trg_dict_path,
-        enc_conv_blocks=enc_conv_blocks,
-        dec_conv_blocks=dec_conv_blocks,
-        emb_dim=args.emb_size,
-        pos_size=args.pos_size,
-        drop_rate=args.drop_rate,
-        batch_size=args.batch_size,
-        num_passes=args.num_passes)
-
-
-if __name__ == '__main__':
-    main()
--
GitLab
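The core building block removed in model.py above is a gated convolution: each block projects a context window to `2 * size` channels, treats the first half as the candidate output and the sigmoid of the second half as a gate, and multiplies the two. Below is a minimal NumPy sketch of just that gating step, given after the patch for reference; the function name, array names, and shapes are illustrative assumptions and are not part of the deleted code.

```python
import numpy as np


def gated_linear_unit(context_proj):
    """Split a (seq_len, 2 * size) projection into value and gate halves
    and combine them as value * sigmoid(gate), as in Gehring et al. (2017)."""
    size = context_proj.shape[-1] // 2
    value, gate = context_proj[:, :size], context_proj[:, size:]
    return value * (1.0 / (1.0 + np.exp(-gate)))


# Illustrative usage: 5 time steps, block width 256 -> projection width 512.
proj = np.random.randn(5, 512).astype("float32")
out = gated_linear_unit(proj)  # shape (5, 256)
```

In the deleted implementation this product is what `gated_conv_with_batchnorm` returns for each encoder and decoder block, computed after batch normalization of the projection.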