train.py 6.5 KB
Newer Older
F
frankwhzhang 已提交
1 2 3 4 5 6 7 8 9 10 11
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import time
import utils
F
8.3  
frankwhzhang 已提交
12

F
frankwhzhang 已提交
13 14
# Fixed random seed assigned to the startup program when --enable_ce is set.
SEED = 102

F
8.3  
frankwhzhang 已提交
15

F
frankwhzhang 已提交
16 17
def parse_args():
    """Parse the command-line options of the gru4rec training script.

    Two positional arguments, ``train_file`` and ``test_file``, are
    required. ``--enable_ce`` is a boolean flag and ``--num_devices`` is an
    integer that defaults to 1.

    Returns:
        The ``argparse.Namespace`` produced from the process arguments.
    """
    cli = argparse.ArgumentParser("gru4rec benchmark.")

    # Required positional inputs, registered in the order they are expected.
    for positional in ('train_file', 'test_file'):
        cli.add_argument(positional)

    cli.add_argument(
        '--enable_ce',
        action='store_true',
        help='If set, run \
        the task with continuous evaluation logs.')
    cli.add_argument(
        '--num_devices', type=int, default=1, help='Number of GPU devices')

    return cli.parse_args()

F
frankwhzhang 已提交
31

F
frankwhzhang 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """Assemble the model graph and return its cost and accuracy outputs.

    The layers are chained as: embedding -> fc (``hid_size * 3`` units) ->
    dynamic GRU (``hid_size``) -> softmax fc (``vocab_size`` units). Every
    parameter is initialized uniformly in
    ``[init_low_bound, init_high_bound]``.

    Args:
        src: input variable fed to the embedding layer.
        dst: label variable used by the cross-entropy and accuracy layers.
        vocab_size: number of embedding rows and size of the softmax output.
        hid_size: embedding width and GRU hidden size.
        init_low_bound: lower bound of the uniform initializer.
        init_high_bound: upper bound of the uniform initializer.

    Returns:
        A ``(cost, acc)`` tuple: the cross-entropy output and the accuracy
        output computed with ``k=20``.
    """
    # Learning-rate multipliers handed to ParamAttr for each layer group.
    embedding_lr, recurrent_lr, output_lr = 10.0, 1.0, 1.0

    def uniform_attr(lr_scale):
        # A fresh ParamAttr per layer; only the learning-rate scale varies.
        return fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=lr_scale)

    embedded = fluid.layers.embedding(
        input=src,
        size=[vocab_size, hid_size],
        param_attr=uniform_attr(embedding_lr),
        is_sparse=True)

    # The projection feeding the GRU is three times the hidden size.
    gru_input = fluid.layers.fc(
        input=embedded,
        size=hid_size * 3,
        param_attr=uniform_attr(recurrent_lr))
    hidden = fluid.layers.dynamic_gru(
        input=gru_input,
        size=hid_size,
        param_attr=uniform_attr(recurrent_lr))

    probabilities = fluid.layers.fc(
        input=hidden,
        size=vocab_size,
        act='softmax',
        param_attr=uniform_attr(output_lr))

    cost = fluid.layers.cross_entropy(input=probabilities, label=dst)
    acc = fluid.layers.accuracy(input=probabilities, label=dst, k=20)
    return cost, acc

F
frankwhzhang 已提交
72

F
frankwhzhang 已提交
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
def train(train_reader,
          vocab,
          network,
          hid_size,
          base_lr,
          batch_size,
          pass_num,
          use_cuda,
          parallel,
          model_dir,
          init_low_bound=-0.04,
          init_high_bound=0.04):
    """Build the training program, run it and save a model after each epoch.

    Args:
        train_reader: callable returning an iterable of batches; each batch
            is a sequence of items whose element 0 is the source sequence
            and element 1 the target sequence.
        vocab: vocabulary container; only its length is used.
        network: callable with the signature of ``network`` in this file,
            returning ``(cost, acc)``.
        hid_size: hidden size forwarded to ``network``.
        base_lr: learning rate of the Adagrad optimizer.
        batch_size: accepted for interface compatibility; not read here.
        pass_num: number of epochs to run.
        use_cuda: run on ``CUDAPlace(0)`` when true, otherwise on CPU.
        parallel: use a ``ParallelExecutor`` for the training steps.
        model_dir: directory prefix; each epoch is saved under
            ``<model_dir>/epoch_<n>``.
        init_low_bound: lower bound of the uniform parameter initializer.
        init_high_bound: upper bound of the uniform parameter initializer.
    """
    args = parse_args()
    if args.enable_ce:
        # The random seed must be set before configuring the network.
        fluid.default_startup_program().random_seed = SEED

    vocab_size = len(vocab)

    # Input data
    src_wordseq = fluid.layers.data(
        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
    dst_wordseq = fluid.layers.data(
        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)

    # Train program
    cost, acc = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                        init_low_bound, init_high_bound)
    avg_cost = fluid.layers.mean(x=cost)

    # Optimization to minimize the loss
    optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
    optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    total_time = 0.0
    fetch_list = [avg_cost.name]
    for pass_idx in six.moves.xrange(pass_num):
        epoch_idx = pass_idx + 1
        # Function-call form so the line parses on both Python 2 and 3.
        print("epoch_%d start" % epoch_idx)

        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
                                                 place)
            lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
                                                 place)
            ret_avg_cost = train_exe.run(
                feed={
                    "src_wordseq": lod_src_wordseq,
                    "dst_wordseq": lod_dst_wordseq
                },
                fetch_list=fetch_list)
            # Perplexity is exp(loss); the mean collapses the fetched array
            # to a single scalar for logging.
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % 10 == 0:
                print("step:%d ppl:%.3f" % (i, newest_ppl))

        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))

        if pass_idx == pass_num - 1 and args.enable_ce:
            # Note: The following logs are special for CE monitoring.
            # Other situations do not need to care about these logs.
            # get_cards expects the parsed-arguments object, not the flag.
            gpu_num = get_cards(args)
            if gpu_num == 1:
                print("kpis    rsc15_pass_duration    %s" %
                      (total_time / epoch_idx))
                print("kpis    rsc15_avg_ppl    %s" % newest_ppl)
            else:
                print("kpis    rsc15_pass_duration_card%s    %s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis    rsc15_avg_ppl_card%s    %s" %
                      (gpu_num, newest_ppl))

        # Persist an inference model at the end of every epoch.
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost, acc]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    print("finish training")

F
8.3  
frankwhzhang 已提交
170

F
frankwhzhang 已提交
171 172 173 174 175 176 177 178
def get_cards(args):
    """Return the number of devices to account for.

    With ``args.enable_ce`` set, the count is the number of comma-separated
    entries in the ``CUDA_VISIBLE_DEVICES`` environment variable. When that
    variable is not set, or when ``args.enable_ce`` is false,
    ``args.num_devices`` is returned.

    Args:
        args: parsed arguments exposing ``enable_ce`` and ``num_devices``.

    Returns:
        The device count as an int.
    """
    if not args.enable_ce:
        return args.num_devices

    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is None:
        # Unset variable: avoid calling .split on None and use the CLI value.
        return args.num_devices
    return len(visible.split(","))

F
frankwhzhang 已提交
179

F
frankwhzhang 已提交
180 181 182
def train_net():
    """Read the CLI arguments, prepare the data readers and start training."""
    cli = parse_args()
    batch_size = 50

    # The reader batch is the base batch scaled by the reported card count.
    reader_batch = batch_size * get_cards(cli)
    vocab, train_reader, _test_reader = utils.prepare_data(
        cli.train_file,
        cli.test_file,
        batch_size=reader_batch,
        buffer_size=1000,
        word_freq_threshold=0)

    # Fixed hyper-parameters and run settings for this script.
    settings = dict(
        hid_size=100,
        base_lr=0.01,
        batch_size=batch_size,
        pass_num=10,
        use_cuda=True,
        parallel=False,
        model_dir="model_recall20",
        init_low_bound=-0.1,
        init_high_bound=0.1)

    train(
        train_reader=train_reader,
        vocab=vocab,
        network=network,
        **settings)

F
frankwhzhang 已提交
203

F
frankwhzhang 已提交
204 205
# Script entry point: start training only when this file is run directly.
if __name__ == "__main__":
    train_net()