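"""train_on_cloud.py: Train a GRU language model on the Penn Treebank (PTB)
dataset with PaddlePaddle Fluid.

Expects ptb.train.txt under ./train/ and ptb.valid.txt / ptb.test.txt under
./test/; inference models are saved under ./output/model/epoch_<n>.
"""
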
import os
import sys
import time
import six

import numpy as np
import math

import collections
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework

cluster_train_dir = "./train/"
cluster_test_dir = "./test/"
train_file = "ptb.train.txt"
valid_file = "ptb.valid.txt"
test_file = "ptb.test.txt"


class DataType(object):
    """ data type """
    NGRAM = 1
    SEQ = 2


def word_count(f, word_freq=None):
    """ count words """
    if word_freq is None:
        word_freq = collections.defaultdict(int)

    for line in f:
        for w in line.strip().split():
            word_freq[w] += 1
        word_freq['<s>'] += 1
        word_freq['<e>'] += 1

    return word_freq


def build_dict(min_word_freq=50):
    """ build dictionary """
    train_filename = cluster_train_dir + train_file
    valid_filename = cluster_test_dir + valid_file
    trainf = open(train_filename).readlines()
    validf = open(valid_filename).readlines()
    word_freq = word_count(validf, word_count(trainf))
    if '<unk>' in word_freq:
        del word_freq['<unk>']
    word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*word_freq_sorted))
    word_idx = dict(zip(words, six.moves.xrange(len(words))))
    word_idx['<unk>'] = len(words)
    return word_idx


def reader_creator(filename, word_idx, n, data_type):
    """ create reader """

    def reader():
        f = open(filename).readlines()
        UNK = word_idx['<unk>']
        for line in f:
            if DataType.NGRAM == data_type:
                assert n > -1, 'Invalid gram length'
                line = ['<s>'] + line.strip().split() + ['<e>']
                if len(line) >= n:
                    line = [word_idx.get(w, UNK) for w in line]
                    for i in range(n, len(line) + 1):
                        yield tuple(line[i - n:i])
            elif DataType.SEQ == data_type:
                line = line.strip().split()
                line = [word_idx.get(w, UNK) for w in line]
                src_seq = [word_idx['<s>']] + line
                trg_seq = line + [word_idx['<e>']]
                if n > 0 and len(src_seq) > n:
                    continue
                yield src_seq, trg_seq
            else:
                assert False, 'Unknown data type'

    return reader


def to_lodtensor(data, place):
    """ convert to LODtensor """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for line in seq_lens:
        cur_len += line
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
    """ prepare the English Pann Treebank (PTB) data """
    vocab = build_dict(word_freq_threshold)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader_creator(
                cluster_train_dir + train_file,
                vocab,
                buffer_size,
                data_type=DataType.SEQ),
            buf_size=buffer_size),
        batch_size)
    test_reader = paddle.batch(
        reader_creator(
            cluster_test_dir + test_file,
            vocab,
            buffer_size,
            data_type=DataType.SEQ),
        batch_size)
    return vocab, train_reader, test_reader


def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """ network definition """
    emb_lr_x = 10.0
    gru_lr_x = 1.0
    fc_lr_x = 1.0
    emb = fluid.layers.embedding(
        input=src,
        size=[vocab_size, hid_size],
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=emb_lr_x),
        is_sparse=True)

    fc0 = fluid.layers.fc(input=emb,
                          size=hid_size * 3,
                          param_attr=fluid.ParamAttr(
                              initializer=fluid.initializer.Uniform(
                                  low=init_low_bound, high=init_high_bound),
                              learning_rate=gru_lr_x))
    gru_h0 = fluid.layers.dynamic_gru(
        input=fc0,
        size=hid_size,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=gru_lr_x))

    fc = fluid.layers.fc(input=gru_h0,
                         size=vocab_size,
                         act='softmax',
                         param_attr=fluid.ParamAttr(
                             initializer=fluid.initializer.Uniform(
                                 low=init_low_bound, high=init_high_bound),
                             learning_rate=fc_lr_x))

    cost = fluid.layers.cross_entropy(input=fc, label=dst)
    return cost


def do_train(train_reader,
             vocab,
             network,
             hid_size,
             base_lr,
             batch_size,
             pass_num,
             use_cuda,
             parallel,
             model_dir,
             init_low_bound=-0.04,
             init_high_bound=0.04):
    """ train network """
    vocab_size = len(vocab)

    src_wordseq = fluid.layers.data(
        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
    dst_wordseq = fluid.layers.data(
        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)

    avg_cost = None
    if not parallel:
        cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                       init_low_bound, init_high_bound)
        avg_cost = fluid.layers.mean(x=cost)
    else:
        places = fluid.layers.device.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            cost = network(
                pd.read_input(src_wordseq),
                pd.read_input(dst_wordseq), vocab_size, hid_size,
                init_low_bound, init_high_bound)
            pd.write_output(cost)

        cost = pd()
        avg_cost = fluid.layers.mean(x=cost)

    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=base_lr,
            decay_steps=2100 * 4,
            decay_rate=0.5,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())
    total_time = 0.0
    for pass_idx in six.moves.xrange(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)

        t0 = time.time()
        i = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = to_lodtensor([dat[0] for dat in data], place)
            lod_dst_wordseq = to_lodtensor([dat[1] for dat in data], place)
            ret_avg_cost = exe.run(fluid.default_main_program(),
                                   feed={
                                       "src_wordseq": lod_src_wordseq,
                                       "dst_wordseq": lod_dst_wordseq
                                   },
                                   fetch_list=[avg_cost],
                                   use_program_cache=True)
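            # avg_cost is the mean per-token cross entropy, so its
            # exponential is the batch perplexity.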
            avg_ppl = math.exp(ret_avg_cost[0])
            if i % 100 == 0:
                print("step:%d ppl:%.3f" % (i, avg_ppl))

        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
                                                         total_time / epoch_idx))

        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    print("finish training")


def train():
    """ do training """
    batch_size = 20
    vocab, train_reader, test_reader = prepare_data(
        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)

    # End batch and end pass event handler (legacy PaddlePaddle v1 API;
    # not used by the Fluid training loop in do_train)
    def event_handler(event):
        """ event handler """
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print("\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            print("isinstance(event, paddle.event.EndPass)")

    do_train(
        train_reader=train_reader,
        vocab=vocab,
        network=network,
        hid_size=200,
        base_lr=1.0,
        batch_size=batch_size,
        pass_num=12,
        use_cuda=True,
        parallel=False,
        model_dir="./output/model",
        init_low_bound=-0.1,
        init_high_bound=0.1)


if __name__ == "__main__":
    if not os.path.exists("./output/model"):
        os.makedirs("./output/model")
    train()