#coding=utf-8
import os
import sys
import time
import argparse
import distutils.util
import gzip

import numpy as np
import paddle.v2 as paddle

from model import conv_seq2seq
import reader


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Convolutional Seq2Seq")
    parser.add_argument(
        '--train_data_path',
        type=str,
        required=True,
        help="Path of the training set")
    parser.add_argument(
        '--test_data_path', type=str, help='Path of the test set')
    parser.add_argument(
        '--src_dict_path',
        type=str,
        required=True,
        help='Path of source dictionary')
    parser.add_argument(
        '--trg_dict_path',
        type=str,
        required=True,
        help='Path of target dictionary')
    parser.add_argument(
        '--enc_blocks', type=str, help='Convolution blocks of the encoder')
    parser.add_argument(
        '--dec_blocks', type=str, help='Convolution blocks of the decoder')
    parser.add_argument(
        '--emb_size',
        type=int,
        default=256,
        help='Dimension of word embedding. (default: %(default)s)')
    parser.add_argument(
        '--pos_size',
        type=int,
        default=200,
        help='Total number of the position indexes. (default: %(default)s)')
    parser.add_argument(
        '--drop_rate',
        type=float,
        default=0.,
        help='Dropout rate. (default: %(default)s)')
    parser.add_argument(
        "--use_bn",
        default=False,
        type=distutils.util.strtobool,
        help="Use batch normalization or not. (default: %(default)s)")
    parser.add_argument(
        "--use_gpu",
        default=False,
        type=distutils.util.strtobool,
        help="Use gpu or not. (default: %(default)s)")
    parser.add_argument(
        "--trainer_count",
        default=1,
        type=int,
        help="Trainer number. (default: %(default)s)")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help="Size of a mini-batch. (default: %(default)s)")
    parser.add_argument(
        '--num_passes',
        type=int,
        default=15,
        help="Number of passes to train. (default: %(default)s)")
    return parser.parse_args()


def create_reader(padding_num,
                  train_data_path,
                  test_data_path=None,
                  src_dict=None,
                  trg_dict=None,
                  pos_size=200,
                  batch_size=32):
    train_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=reader.data_reader(
                data_file=train_data_path,
                src_dict=src_dict,
                trg_dict=trg_dict,
                pos_size=pos_size,
                padding_num=padding_num),
            buf_size=10240),
        batch_size=batch_size)

    test_reader = None
    if test_data_path:
        test_reader = paddle.batch(
            reader=paddle.reader.shuffle(
                reader=reader.data_reader(
                    data_file=test_data_path,
                    src_dict=src_dict,
                    trg_dict=trg_dict,
                    pos_size=pos_size,
                    padding_num=padding_num),
                buf_size=10240),
            batch_size=batch_size)

    return train_reader, test_reader


def train(train_data_path,
          test_data_path,
          src_dict_path,
          trg_dict_path,
          enc_conv_blocks,
          dec_conv_blocks,
          emb_dim=256,
          pos_size=200,
          drop_rate=0.,
          use_bn=False,
          batch_size=32,
          num_passes=15):
    """
    Train the convolution sequence-to-sequence model.

    :param train_data_path: The path of the training set.
    :type train_data_path: str
    :param test_data_path: The path of the test set.
    :type test_data_path: str
    :param src_dict_path: The path of the source dictionary.
    :type src_dict_path: str
    :param trg_dict_path: The path of the target dictionary.
    :type trg_dict_path: str
    :param enc_conv_blocks: The scale list of the encoder's convolution blocks.
                            And each element of the list contains output
                            dimension and context length of the corresponding
                            convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution blocks.
                            And each element of the list contains output
                            dimension and context length of the corresponding
                            convolution block.
    :type dec_conv_blocks: list of tuple
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param pos_size: The total number of the position indexes, which means
                     the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param use_bn: Whether to use batch normalization or not. False is the
                   default value.
    :type use_bn: bool
    :param batch_size: The size of a mini-batch.
    :type batch_size: int
    :param num_passes: The total number of the passes to train.
    :type num_passes: int
    """
    # load the source and target dictionaries
    src_dict = reader.load_dict(src_dict_path)
    trg_dict = reader.load_dict(trg_dict_path)
    src_dict_size = len(src_dict)
    trg_dict_size = len(trg_dict)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)

    # build the network topology with the training cost as its output
    cost = conv_seq2seq(
        src_dict_size=src_dict_size,
        trg_dict_size=trg_dict_size,
        pos_size=pos_size,
        emb_dim=emb_dim,
        enc_conv_blocks=enc_conv_blocks,
        dec_conv_blocks=dec_conv_blocks,
        drop_rate=drop_rate,
        with_bn=use_bn,
        is_infer=False)

    # create parameters and trainer
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # each decoder block with context length k needs k - 1 padding positions
    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
    padding_num = sum(padding_list)
    train_reader, test_reader = create_reader(
        padding_num=padding_num,
        train_data_path=train_data_path,
        test_data_path=test_data_path,
        src_dict=src_dict,
        trg_dict=trg_dict,
        pos_size=pos_size,
        batch_size=batch_size)

    # map the data layer names to the columns produced by the reader
    feeding = {
        'src_word': 0,
        'src_word_pos': 1,
        'trg_word': 2,
        'trg_word_pos': 3,
        'trg_next_word': 4
    }

    # create event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 20 == 0:
                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
                print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
                    cur_time, event.pass_id, event.batch_id, event.cost,
                    event.metrics)
                sys.stdout.flush()

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
                result = trainer.test(reader=test_reader, feeding=feeding)
                print "[%s]: Pass: %d, TestCost: %f, %s" % (
                    cur_time, event.pass_id, result.cost, result.metrics)
                sys.stdout.flush()
            # save the parameters at the end of every pass
            with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
                           'w') as f:
                trainer.save_parameter_to_tar(f)

    if not os.path.exists('output'):
        os.mkdir('output')

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        num_passes=num_passes,
        feeding=feeding)


def main():
    args = parse_args()

    # --enc_blocks and --dec_blocks are Python expressions that evaluate to
    # lists of (output dimension, context length) tuples
    enc_conv_blocks = eval(args.enc_blocks)
    dec_conv_blocks = eval(args.dec_blocks)

    # building the deep topology may exceed the default recursion limit
    sys.setrecursionlimit(10000)

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

    train(
        train_data_path=args.train_data_path,
        test_data_path=args.test_data_path,
        src_dict_path=args.src_dict_path,
        trg_dict_path=args.trg_dict_path,
        enc_conv_blocks=enc_conv_blocks,
        dec_conv_blocks=dec_conv_blocks,
        emb_dim=args.emb_size,
        pos_size=args.pos_size,
        drop_rate=args.drop_rate,
        use_bn=args.use_bn,
        batch_size=args.batch_size,
        num_passes=args.num_passes)


if __name__ == '__main__':
    main()
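
# A minimal example invocation (a sketch only: the data and dictionary paths
# below are placeholders, and the --enc_blocks/--dec_blocks values are
# illustrative literals, each tuple giving an (output dimension, context
# length) pair as described in train()'s docstring):
#
#   python train.py \
#       --train_data_path data/train \
#       --test_data_path data/test \
#       --src_dict_path data/src_dict \
#       --trg_dict_path data/trg_dict \
#       --enc_blocks "[(256, 3)] * 5" \
#       --dec_blocks "[(256, 3)] * 3" \
#       --use_bn False \
#       --use_gpu False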