utils.py 1.5 KB
Newer Older
Y
Yi Liu 已提交
1 2 3 4 5
import sys
import time
import numpy as np

import paddle.fluid as fluid
G
guochaorong 已提交
6
import paddle
Y
Yi Liu 已提交
7

G
guochaorong 已提交
8

Y
Yi Liu 已提交
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
def to_lodtensor(data, place):
    """Pack a batch of sequences into a single-level ``fluid.LoDTensor``.

    Args:
        data: Sequence of sequences. They are concatenated along axis 0
            and cast to ``int64``.
        place: Forwarded unchanged to ``LoDTensor.set`` together with the
            flattened array.

    Returns:
        A ``fluid.LoDTensor`` whose values are the concatenated input
        reshaped to ``[total_len, 1]`` and whose LoD is one level of
        cumulative sequence offsets.
    """
    # Cumulative offsets: entry i is where sequence i starts in the
    # flattened array; the last entry is the total element count.
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))

    flat = np.concatenate(data, axis=0).astype("int64")
    column = flat.reshape([len(flat), 1])

    tensor = fluid.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor


G
guochaorong 已提交
25 26 27 28
def prepare_data(batch_size,
                 buffer_size=1000,
                 word_freq_threshold=0,
                 enable_ce=False):
    """Build the vocabulary and batched readers for the Penn Treebank (PTB) data.

    Args:
        batch_size: Batch size handed to ``paddle.batch`` for both readers.
        buffer_size: Used as ``buf_size`` for ``paddle.reader.shuffle`` and
            also passed as the second positional argument to
            ``imikolov.train`` / ``imikolov.test``.
        word_freq_threshold: Forwarded to ``imikolov.build_dict``.
        enable_ce: When true, the training reader is batched directly
            without the shuffle wrapper.

    Returns:
        A ``(vocab, train_reader, test_reader)`` tuple.
    """
    imikolov = paddle.dataset.imikolov
    vocab = imikolov.build_dict(word_freq_threshold)

    # NOTE(review): buffer_size doubles as the second positional argument
    # of imikolov.train/test (presumably the dataset's `n` parameter) --
    # confirm that sharing one value with the shuffle buffer is intended.
    train_source = imikolov.train(
        vocab, buffer_size, data_type=imikolov.DataType.SEQ)
    if not enable_ce:
        train_source = paddle.reader.shuffle(
            train_source, buf_size=buffer_size)
    train_reader = paddle.batch(train_source, batch_size)

    test_source = imikolov.test(
        vocab, buffer_size, data_type=imikolov.DataType.SEQ)
    test_reader = paddle.batch(test_source, batch_size)

    return vocab, train_reader, test_reader