# prepare.py

import paddle.v2 as paddle
import tarfile
import os
import pickle

SPLIT_COUNT = 3
N = 5


def file_len(fd):
    """Return the number of items (lines) produced by iterating ``fd``.

    Args:
        fd: Any iterable; for an open text file each item is one line.

    Returns:
        The item count as an int. An empty iterable yields 0 (the previous
        implementation raised ``UnboundLocalError`` because its loop
        variable was never bound).
    """
    return sum(1 for _ in fd)


def split_from_reader_by_line(filename, reader, split_count):
    """Dump ``reader`` to ``filename`` and cut that file into line chunks.

    Every record yielded by ``reader()`` becomes one line: its elements are
    converted with ``str`` and joined by commas. The finished file is then
    copied into consecutive pieces named ``<filename>-00000``,
    ``<filename>-00001``, ... each holding at most
    ``total_lines // split_count + 1`` lines, so no more than
    ``split_count`` pieces are produced. An empty reader leaves an empty
    ``filename`` and creates no pieces.

    The chunking is done in Python rather than by shelling out to
    ``split -d -a 5 -l N``; the piece names and line counts are the same,
    but it no longer depends on GNU coreutils and works for filenames
    containing spaces or shell metacharacters.

    Args:
        filename: Path of the combined output file; also the chunk prefix.
        reader: Zero-argument callable returning an iterable of records,
            each record itself being an iterable of values.
        split_count: Desired maximum number of chunk files.

    Raises:
        ZeroDivisionError: If ``split_count`` is 0.
    """
    with open(filename, "w") as out:
        for batch_data in reader():
            out.write(",".join(str(d) for d in batch_data))
            out.write("\n")

    chunk = None
    try:
        # Binary mode: lines are delimited by "\n" only and copied verbatim.
        with open(filename, "rb") as src:
            total_line_count = sum(1 for _ in src)
            # Floor division keeps the result an int on Python 2 and 3.
            per_file_lines = total_line_count // split_count + 1

            src.seek(0)
            for line_no, line in enumerate(src):
                if line_no % per_file_lines == 0:
                    # Start the next piece; close the one just filled.
                    if chunk is not None:
                        chunk.close()
                    chunk_name = "%s-%05d" % (filename,
                                              line_no // per_file_lines)
                    chunk = open(chunk_name, "wb")
                chunk.write(line)
    finally:
        if chunk is not None:
            chunk.close()


# Build the dictionary from the imikolov dataset and persist it to
# word_dict.pickle in the current working directory.
word_dict = paddle.dataset.imikolov.build_dict()
# pickle emits bytes, so the file must be opened in binary mode; text mode
# fails on Python 3 and can corrupt the data where newlines are translated.
with open("word_dict.pickle", "wb") as dict_f:
    pickle.dump(word_dict, dict_f)

# Dump the train and test readers to text files, each cut into at most
# SPLIT_COUNT chunks. N is forwarded as the second reader argument
# (presumably the n-gram window size -- confirm against the paddle API).
split_from_reader_by_line("train.txt",
                          paddle.dataset.imikolov.train(word_dict, N),
                          SPLIT_COUNT)
split_from_reader_by_line("test.txt",
                          paddle.dataset.imikolov.test(word_dict, N),
                          SPLIT_COUNT)