train.py 6.1 KB
Newer Older
G
guochaorong 已提交
1
import os
Y
Yi Liu 已提交
2 3 4 5 6
import sys
import time

import numpy as np
import math
G
guochaorong 已提交
7
import argparse
Y
Yi Liu 已提交
8
import paddle.fluid as fluid
G
guochaorong 已提交
9
import paddle
Y
Yi Liu 已提交
10 11 12

import utils

G
guochaorong 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25
# Random seed assigned to the startup program when --enable_ce is set.
SEED = 102


def parse_args():
    """Parse the command-line flags of the language-model benchmark.

    Returns:
        The argparse namespace; its only attribute is the boolean
        ``enable_ce`` (True when ``--enable_ce`` is given).
    """
    cli = argparse.ArgumentParser("language_model benchmark.")
    # Options of the single supported flag, kept together for readability.
    ce_flag_options = dict(
        action='store_true',
        help='If set, run \
        the task with continuous evaluation logs.')
    cli.add_argument('--enable_ce', **ce_flag_options)
    return cli.parse_args()

Y
Yi Liu 已提交
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79

def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """Build the embedding -> fc -> dynamic GRU -> softmax fc graph.

    Args:
        src: input variable fed to the embedding layer.
        dst: label variable used by the cross-entropy cost.
        vocab_size: embedding table height and size of the output layer.
        hid_size: embedding width and GRU size.
        init_low_bound: lower bound of the uniform parameter initializer.
        init_high_bound: upper bound of the uniform parameter initializer.

    Returns:
        The cross-entropy cost variable (not averaged).
    """
    # Values passed as ``learning_rate`` of each layer's ParamAttr.
    emb_lr_x = 10.0
    gru_lr_x = 1.0
    fc_lr_x = 1.0

    def uniform_attr(lr_scale):
        # A fresh ParamAttr per layer, uniformly initialised within the
        # bounds given to network().
        return fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=lr_scale)

    word_emb = fluid.layers.embedding(
        input=src,
        size=[vocab_size, hid_size],
        param_attr=uniform_attr(emb_lr_x),
        is_sparse=True)

    # The projection feeding the GRU is three times the hidden size.
    gru_input = fluid.layers.fc(
        input=word_emb,
        size=hid_size * 3,
        param_attr=uniform_attr(gru_lr_x))
    gru_out = fluid.layers.dynamic_gru(
        input=gru_input,
        size=hid_size,
        param_attr=uniform_attr(gru_lr_x))

    word_probs = fluid.layers.fc(
        input=gru_out,
        size=vocab_size,
        act='softmax',
        param_attr=uniform_attr(fc_lr_x))

    return fluid.layers.cross_entropy(input=word_probs, label=dst)


def train(train_reader,
          vocab,
          network,
          hid_size,
          base_lr,
          batch_size,
          pass_num,
          use_cuda,
          parallel,
          model_dir,
          init_low_bound=-0.04,
          init_high_bound=0.04):
    """Build the training program, run it and save a model after each pass.

    Args:
        train_reader: callable returning an iterable of batches; every
            sample in a batch is indexed as ``sample[0]`` (source words)
            and ``sample[1]`` (target words).
        vocab: vocabulary container; only ``len(vocab)`` is used.
        network: callable building the cost, invoked as
            ``network(src, dst, vocab_size, hid_size, init_low_bound,
            init_high_bound)``.
        hid_size: hidden size forwarded to ``network``.
        base_lr: initial learning rate of the exponentially decayed SGD.
        batch_size: accepted for interface compatibility; not used here.
        pass_num: number of passes (epochs) over ``train_reader``.
        use_cuda: selects CUDAPlace(0) or CPUPlace, and is forwarded to
            the ParallelExecutor.
        parallel: accepted for interface compatibility; not used here, a
            ParallelExecutor is always created.
        model_dir: directory under which ``epoch_<n>`` inference models
            are saved.
        init_low_bound: lower initializer bound forwarded to ``network``.
        init_high_bound: upper initializer bound forwarded to ``network``.
    """
    args = parse_args()
    if args.enable_ce:
        # The random seed must be set before configuring the network.
        fluid.default_startup_program().random_seed = SEED
    vocab_size = len(vocab)

    # Input data
    src_wordseq = fluid.layers.data(
        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
    dst_wordseq = fluid.layers.data(
        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)

    # Train program
    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                   init_low_bound, init_high_bound)
    avg_cost = fluid.layers.mean(x=cost)

    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=base_lr,
            decay_steps=2100 * 4,
            decay_rate=0.5,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Honour use_cuda here as well, so the parallel executor agrees with
    # the place the feed tensors are created on.
    train_exe = fluid.ParallelExecutor(
        use_cuda=use_cuda, loss_name=avg_cost.name)

    total_time = 0.0
    fetch_list = [avg_cost.name]
    for pass_idx in range(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)

        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            # Split the batch into source and target word sequences.
            lod_src_wordseq = utils.to_lodtensor(
                [sample[0] for sample in data], place)
            lod_dst_wordseq = utils.to_lodtensor(
                [sample[1] for sample in data], place)
            ret_avg_cost = train_exe.run(
                feed={
                    "src_wordseq": lod_src_wordseq,
                    "dst_wordseq": lod_dst_wordseq
                },
                fetch_list=fetch_list)
            # Perplexity is exp(cost); average over the fetched values.
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % 100 == 0:
                print("step:%d ppl:%.3f" % (i, newest_ppl))

        t1 = time.time()
        total_time += t1 - t0
        # time_cost is the mean duration of the passes run so far.
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))

        if pass_idx == pass_num - 1 and args.enable_ce:
            # Note: The following logs are special for CE monitoring.
            # Other situations do not need to care about these logs.
            gpu_num = get_cards()
            if gpu_num == 1:
                print("kpis\timikolov_20_pass_duration\t%s" %
                      (total_time / epoch_idx))
                print("kpis\timikolov_20_avg_ppl\t%s" % newest_ppl)
            else:
                print("kpis\timikolov_20_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\timikolov_20_avg_ppl_card%s\t%s" %
                      (gpu_num, newest_ppl))

        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    print("finish training")

G
guochaorong 已提交
166

G
guochaorong 已提交
167 168 169 170
def get_cards():
    """Return the number of cards listed in ``CUDA_VISIBLE_DEVICES``.

    The value is split on commas and the entries are counted.

    Returns:
        The number of comma-separated entries, or 1 when the variable is
        not set (previously this case raised AttributeError).

    NOTE(review): an unset variable is treated as a single card. Export
    CUDA_VISIBLE_DEVICES explicitly for multi-card runs so the count
    matches the devices actually used -- confirm this fallback suits the
    CE environment.
    """
    cards = os.environ.get('CUDA_VISIBLE_DEVICES')
    if cards is None:
        return 1
    return len(cards.split(","))
Y
Yi Liu 已提交
171

G
guochaorong 已提交
172

Y
Yi Liu 已提交
173 174 175
def train_net():
    """Prepare the data and launch training with fixed hyper-parameters.

    Reads ``--enable_ce`` from the command line, builds the readers through
    ``utils.prepare_data`` and hands the training reader to ``train``.
    """
    batch_size = 20
    args = parse_args()
    # The reader batch size is batch_size multiplied by the card count
    # reported by get_cards(). The test reader is returned by
    # prepare_data but is not used by this script.
    vocab, train_reader, _test_reader = utils.prepare_data(
        batch_size=batch_size * get_cards(),
        buffer_size=1000,
        word_freq_threshold=0,
        enable_ce=args.enable_ce)
    train(
        train_reader=train_reader,
        vocab=vocab,
        network=network,
        hid_size=200,
        base_lr=1.0,
        batch_size=batch_size,
        pass_num=12,
        use_cuda=True,
        parallel=True,
        model_dir="model",
        init_low_bound=-0.1,
        init_high_bound=0.1)


# Start training only when this file is executed as a script.
if __name__ == "__main__":
    train_net()