train.py 5.6 KB
Newer Older
F
frankwhzhang 已提交
1 2 3 4 5 6 7 8 9 10 11
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import time
import utils
F
frankwhzhang 已提交
12
import net
F
8.3  
frankwhzhang 已提交
13

F
frankwhzhang 已提交
14 15
# Fixed random seed assigned to the default startup and main programs when
# the --enable_ce flag is set, so those runs start from the same seed.
SEED = 102

F
8.3  
frankwhzhang 已提交
16

F
frankwhzhang 已提交
17 18 19
def parse_args():
    """Parse the command-line options of the training script.

    Options are read from ``sys.argv``. The integer options ``--is_local``,
    ``--use_cuda`` and ``--parallel`` act as 0/1 switches.

    Returns:
        argparse.Namespace: the parsed options, with the attributes
        ``train_dir``, ``vocab_path``, ``is_local``, ``hid_size``,
        ``model_dir``, ``batch_size``, ``print_batch``, ``pass_num``,
        ``use_cuda``, ``parallel``, ``base_lr``, ``num_devices`` and
        ``enable_ce``.
    """
    # NOTE: the string is passed positionally, so argparse uses it as the
    # program name (``prog``) shown in usage/help output.
    parser = argparse.ArgumentParser("gru4rec benchmark.")

    # Input data.
    parser.add_argument(
        '--train_dir', type=str, default='train_data', help='train file')
    parser.add_argument(
        '--vocab_path', type=str, default='vocab.txt', help='vocab file')
    parser.add_argument(
        '--is_local', type=int, default=1, help='whether is local')

    # Model size and output location.
    parser.add_argument(
        '--hid_size', type=int, default=100, help='hidden-dim size')
    parser.add_argument(
        '--model_dir', type=str, default='model_recall20', help='model dir')

    # Training schedule.
    parser.add_argument(
        '--batch_size', type=int, default=5, help='num of batch size')
    parser.add_argument(
        '--print_batch', type=int, default=10, help='num of print batch')
    parser.add_argument(
        '--pass_num', type=int, default=10, help='number of epoch')
    parser.add_argument(
        '--base_lr', type=float, default=0.01, help='learning rate')

    # Device selection.
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether use gpu')
    parser.add_argument(
        '--parallel', type=int, default=0, help='whether parallel')
    parser.add_argument(
        '--num_devices', type=int, default=1, help='Number of GPU devices')

    # Continuous-evaluation mode.
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help='If set, run the task with continuous evaluation logs.')

    return parser.parse_args()

Z
zhangwenhui03 已提交
50

F
frankwhzhang 已提交
51 52
def get_cards(args):
    """Return the device count stored in ``args.num_devices``."""
    num_devices = args.num_devices
    return num_devices
F
frankwhzhang 已提交
53

Z
zhangwenhui03 已提交
54

F
frankwhzhang 已提交
55 56
def train():
    """Run training and save an inference model after every epoch.

    Reads its configuration from the command line via ``parse_args()``,
    builds the network with ``net.all_vocab_network``, optimizes the average
    cost with Adagrad and iterates over the training reader for
    ``--pass_num`` epochs. After each epoch an inference model is written to
    ``<model_dir>/epoch_<n>``.

    When ``--enable_ce`` is set, the program seeds are fixed to ``SEED``,
    each epoch stops once more than 1000 steps have run, and tab-separated
    ``kpis`` lines (average epoch duration and a training perplexity) are
    printed at the end.
    """
    args = parse_args()
    if args.enable_ce:
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED

    hid_size = args.hid_size
    train_dir = args.train_dir
    vocab_path = args.vocab_path
    use_cuda = bool(args.use_cuda)
    parallel = bool(args.parallel)
    print("use_cuda:", use_cuda, "parallel:", parallel)
    batch_size = args.batch_size
    # The reader batch size is the per-device batch size times the device count.
    vocab_size, train_reader = utils.prepare_data(
        train_dir,
        vocab_path,
        batch_size=batch_size * get_cards(args),
        buffer_size=1000,
        word_freq_threshold=0,
        is_train=True)

    # Train program
    src_wordseq, dst_wordseq, avg_cost, acc = net.all_vocab_network(
        vocab_size=vocab_size, hid_size=hid_size)

    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    sgd_optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    pass_num = args.pass_num
    model_dir = args.model_dir
    fetch_list = [avg_cost.name]

    # Arguments for saving the inference model; identical for every epoch.
    feed_var_names = ["src_wordseq", "dst_wordseq"]
    fetch_vars = [avg_cost, acc]

    ce_info = []
    total_time = 0.0
    for pass_idx in six.moves.xrange(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)

        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
                                                 place)
            lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
                                                 place)
            ret_avg_cost = train_exe.run(
                feed={
                    "src_wordseq": lod_src_wordseq,
                    "dst_wordseq": lod_dst_wordseq
                },
                fetch_list=fetch_list)
            # Perplexity is reported as the mean of exp(fetched average cost).
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if args.enable_ce:
                # The history is only read by the CE report below, so it is
                # not accumulated during ordinary training runs.
                ce_info.append(newest_ppl)
            if i % args.print_batch == 0:
                print("step:%d ppl:%.3f" % (i, newest_ppl))
            if args.enable_ce and i > 1000:
                break

        t1 = time.time()
        total_time += t1 - t0
        # time_cost is the running average duration per epoch so far.
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    # only for ce
    if args.enable_ce:
        ce_ppl = 0
        try:
            # Second-to-last recorded perplexity.
            # NOTE(review): why -2 rather than -1 is not evident from this file.
            ce_ppl = ce_info[-2]
        except IndexError:
            # Fewer than two steps were recorded; keep the 0 placeholder.
            print("ce info error")
        epoch_idx = args.pass_num
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_ppl_gpu%s\t%s" % (gpu_num, ce_ppl))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_ppl_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_ppl))

    print("finish training")
F
frankwhzhang 已提交
156

F
frankwhzhang 已提交
157

Z
zhengya01 已提交
158 159 160 161 162 163 164 165 166 167 168
def get_device(args):
    """Describe the devices in use, based on ``args`` and the environment.

    Args:
        args: object with a ``use_cuda`` attribute; a truthy value selects
            the GPU branch.

    Returns:
        tuple: ``("gpu", gpu_num)`` when ``args.use_cuda`` is truthy, where
        ``gpu_num`` is the number of comma-separated entries in the
        ``CUDA_VISIBLE_DEVICES`` environment variable (1 if it is unset);
        otherwise ``("cpu", cpu_num, threads_num)`` read from the ``CPU_NUM``
        and ``NUM_THREADS`` environment variables (each defaulting to 1).

    Raises:
        ValueError: if ``CPU_NUM`` or ``NUM_THREADS`` is set to a value that
            is not an integer.
    """
    if args.use_cuda:
        # The fallback must be a string because the value is split on commas;
        # an integer fallback raised AttributeError when the variable was
        # unset. "0" has a single entry, so the reported count is still 1.
        gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
        gpu_num = len(gpus.split(','))
        return "gpu", gpu_num

    threads_num = os.environ.get('NUM_THREADS', 1)
    cpu_num = os.environ.get('CPU_NUM', 1)
    return "cpu", int(cpu_num), int(threads_num)
        

F
frankwhzhang 已提交
169
if __name__ == "__main__":
    # Start training only when this file is executed as a script.
    train()