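# Training entry point for the session-based GNN recommendation model,
# implemented with the legacy PaddlePaddle Fluid API. It reads session data
# and an item-count config, builds the graph defined in network.py, and
# saves an inference model after every epoch.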
import argparse
import logging

import numpy as np
import paddle.fluid as fluid

import network
import reader

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser("gnn")
    parser.add_argument(
        '--train_path', type=str, default='./data/diginetica/train.txt', help='path of the training data file')
    parser.add_argument(
        '--config_path', type=str, default='./data/diginetica/config.txt', help='path of the config file')
    parser.add_argument(
        '--model_path', type=str, default='./saved_model', help='directory to save model checkpoints')
    parser.add_argument(
        '--epoch_num', type=int, default=30, help='number of epochs to train for')
    parser.add_argument(
        '--batch_size', type=int, default=100, help='input batch size')
    parser.add_argument(
        '--hidden_size', type=int, default=100, help='hidden state size')
    parser.add_argument(
        '--l2', type=float, default=1e-5, help='l2 penalty')
    parser.add_argument(
        '--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument(
        '--step', type=int, default=1, help='number of GNN propagation steps')
    parser.add_argument(
        '--lr_dc', type=float, default=0.1, help='learning rate decay rate')
    parser.add_argument(
        '--lr_dc_step', type=int, default=3, help='number of epochs after which the learning rate decays')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether to use GPU (1) or CPU (0)')
    parser.add_argument(
        '--use_parallel', type=int, default=1, help='whether to use the parallel executor')
    return parser.parse_args()


def train():
    args = parse_args()
    batch_size = args.batch_size
    items_num = reader.read_config(args.config_path)
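    # Build the training graph from network.py; it returns the loss and
    # accuracy tensors to optimize and monitor.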
    loss, acc = network.network(batch_size, items_num, args.hidden_size,
                                args.step)

    data_reader = reader.Data(args.train_path, True)
    logger.info("load data complete")

    use_cuda = bool(args.use_cuda)
    use_parallel = bool(args.use_parallel)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)
    step_per_epoch = data_reader.length // batch_size
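    # Adam with exponential learning-rate decay (a factor of lr_dc every
    # lr_dc_step epochs' worth of batches) and L2 weight regularization.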
    optimizer = fluid.optimizer.Adam(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.lr,
            decay_steps=step_per_epoch * args.lr_dc_step,
            decay_rate=args.lr_dc),
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=args.l2))
    optimizer.minimize(loss)

    exe.run(fluid.default_startup_program())

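    # Fill the persistent "all_vocab" tensor with every item id
    # (1 .. items_num - 1), used by the network to score candidate items.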
    all_vocab = fluid.global_scope().var("all_vocab").get_tensor()
    all_vocab.set(
        np.arange(1, items_num).astype("int64").reshape((-1, 1)), place)

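    # Feed slot names; these must match the input layers declared in network.py.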
    feed_list = [
        "items", "seq_index", "last_index", "adj_in", "adj_out", "mask", "label"
    ]
    feeder = fluid.DataFeeder(feed_list=feed_list, place=place)

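    # Use a ParallelExecutor for multi-device training when requested;
    # otherwise fall back to the single-device executor.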
    if use_parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=loss.name)
    else:
        train_exe = exe

    logger.info("begin train")

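    # Run epoch_num epochs, logging running averages of loss and accuracy
    # every PRINT_STEP batches.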
    loss_sum = 0.0
    acc_sum = 0.0
    global_step = 0
    PRINT_STEP = 500
    for i in range(args.epoch_num):
        epoch_sum = []
        for data in data_reader.reader(batch_size, batch_size * 20, True):
            res = train_exe.run(feed=feeder.feed(data),
                                fetch_list=[loss.name, acc.name])
            # Fetched values come back as numpy arrays (one entry per device
            # when the ParallelExecutor is used), so reduce them to scalars.
            loss_sum += np.mean(res[0])
            acc_sum += np.mean(res[1])
            epoch_sum.append(np.mean(res[0]))
            global_step += 1
            if global_step % PRINT_STEP == 0:
                logger.info("global_step: %d, loss: %.4f, train_acc: %.4f" % (
                    global_step, loss_sum / PRINT_STEP, acc_sum / PRINT_STEP))
                loss_sum = 0.0
                acc_sum = 0.0
        logger.info("epoch %d done, avg loss: %.4f" % (i, np.mean(epoch_sum)))
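        # Save an inference-ready snapshot of the model after each epoch.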
        save_dir = args.model_path + "/epoch_" + str(i)
        fetch_vars = [loss, acc]
        fluid.io.save_inference_model(save_dir, feed_list, fetch_vars, exe)
        logger.info("model saved in " + save_dir)


if __name__ == "__main__":
    train()
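
# Example invocation (a sketch; it assumes the DIGINETICA data files have
# been prepared under ./data/diginetica as the defaults above expect):
#
#   python train.py --train_path ./data/diginetica/train.txt \
#                   --config_path ./data/diginetica/config.txt \
#                   --epoch_num 30 --use_cuda 1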