import os import sys import gzip import click import paddle.v2 as paddle import reader from network_conf import nest_net from utils import build_dict, load_dict, logger @click.command('train') @click.option( "--train_data_dir", default=None, help=("The path of training dataset (default: None). " "If this parameter is not set, " "imdb dataset will be used.")) @click.option( "--test_data_dir", default=None, help=("The path of testing dataset (default: None). " "If this parameter is not set, " "imdb dataset will be used.")) @click.option( "--word_dict_path", type=str, default=None, help=("The path of word dictionary (default: None). " "If this parameter is not set, imdb dataset will be used. " "If this parameter is set, but the file does not exist, " "word dictionay will be built from " "the training data automatically.")) @click.option( "--class_num", type=int, default=2, help="The class number (default: 2).") @click.option( "--batch_size", type=int, default=32, help=("The number of training examples in one batch " "(default: 32).")) @click.option( "--num_passes", type=int, default=10, help="The number of passes to train (default: 10).") @click.option( "--model_save_dir", type=str, default="models", help="The path to save the trained models (default: 'models').") def train(train_data_dir, test_data_dir, word_dict_path, class_num, model_save_dir, batch_size, num_passes): """ :params train_data_path: path of training data, if this parameter is not specified, imdb dataset will be used to run this example :type train_data_path: str :params test_data_path: path of testing data, if this parameter is not specified, imdb dataset will be used to run this example :type test_data_path: str :params word_dict_path: path of training data, if this parameter is not specified, imdb dataset will be used to run this example :type word_dict_path: str :params model_save_dir: dir where models saved :type num_pass: str :params batch_size: train batch size :type num_pass: int :params num_pass: train pass number :type num_pass: int """ if train_data_dir is not None: assert word_dict_path, ("The parameter train_data_dir, word_dict_path " "should be set at the same time.") if not os.path.exists(model_save_dir): os.mkdir(model_save_dir) use_default_data = (train_data_dir is None) if use_default_data: logger.info(("No training data are porivided, " "use imdb to train the model.")) logger.info("Please wait to build the word dictionary ...") word_dict = reader.imdb_word_dict() train_reader = paddle.batch( paddle.reader.shuffle( lambda: reader.imdb_train(word_dict), buf_size=1000), batch_size=100) test_reader = paddle.batch( lambda: reader.imdb_test(word_dict), batch_size=100) class_num = 2 else: if word_dict_path is None or not os.path.exists(word_dict_path): logger.info(("Word dictionary is not given, the dictionary " "is automatically built from the training data.")) # build the word dictionary to map the original string-typed # words into integer-typed index build_dict( data_dir=train_data_dir, save_path=word_dict_path, use_col=1, cutoff_fre=0) word_dict = load_dict(word_dict_path) class_num = class_num logger.info("Class number is : %d." % class_num) train_reader = paddle.batch( paddle.reader.shuffle( reader.train_reader(train_data_dir, word_dict), buf_size=1000), batch_size=batch_size) if test_data_dir is not None: # here, because training and testing data share a same format, # we still use the reader.train_reader to read the testing data. test_reader = paddle.batch( paddle.reader.shuffle( reader.train_reader(test_data_dir, word_dict), buf_size=1000), batch_size=batch_size) else: test_reader = None dict_dim = len(word_dict) emb_size = 28 hidden_size = 128 logger.info("Length of word dictionary is : %d." % (dict_dim)) paddle.init(use_gpu=True, trainer_count=4) # network config cost, prob, label = nest_net( dict_dim, emb_size, hidden_size, class_num, is_infer=False) # create parameters parameters = paddle.parameters.create(cost) # create optimizer adam_optimizer = paddle.optimizer.Adam( learning_rate=1e-3, regularization=paddle.optimizer.L2Regularization(rate=1e-3), model_average=paddle.optimizer.ModelAverage(average_window=0.5)) # create trainer trainer = paddle.trainer.SGD( cost=cost, extra_layers=paddle.evaluator.auc(input=prob, label=label), parameters=parameters, update_equation=adam_optimizer) # begin training network feeding = {"word": 0, "label": 1} def _event_handler(event): """ Define end batch and end pass event handler """ if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: logger.info("Pass %d, Batch %d, Cost %f, %s\n" % ( event.pass_id, event.batch_id, event.cost, event.metrics)) if isinstance(event, paddle.event.EndPass): if test_reader is not None: result = trainer.test(reader=test_reader, feeding=feeding) logger.info("Test at Pass %d, %s \n" % (event.pass_id, result.metrics)) with gzip.open( os.path.join(model_save_dir, "params_pass_%05d.tar.gz" % event.pass_id), "w") as f: parameters.to_tar(f) trainer.train( reader=train_reader, event_handler=_event_handler, feeding=feeding, num_passes=num_passes) logger.info("Training has finished.") if __name__ == "__main__": train()