# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image


def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
                       shape_label):
    """Batch a paddle dataset reader and dump it into a single RecordIO file."""
    num_batches = 0
    with fluid.program_guard(fluid.Program(), fluid.Program()):
        reader = paddle.batch(py_reader(), batch_size=batch_size)
        feeder = fluid.DataFeeder(
            feed_list=[  # order is image and label
                fluid.layers.data(
                    name='image', shape=shape_data),
                fluid.layers.data(
                    name='label', shape=shape_label, dtype='int64'),
            ],
            place=fluid.CPUPlace())
        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
            outfilepath, reader, feeder)
    return num_batches


def prepare_mnist(outpath, batch_size):
    outfilepath = os.path.join(outpath, "mnist.recordio")
    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])


def prepare_cifar10(outpath, batch_size):
    outfilepath = os.path.join(outpath, "cifar.recordio")
    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])


def prepare_flowers(outpath, batch_size):
    outfilepath = os.path.join(outpath, "flowers.recordio")
    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
                       [1])


def default_mapper(sample):
    img, label = sample
    img = image.simple_transform(
        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
    return img.flatten().astype('float32'), label


def imagenet_train(data_dir):
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "train.txt")) as fn:
        while 1:
            l = fn.readline()
            if not l:
                break
            img, lbl = l[:-1].split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)
    # shuffle all, this is slow
    random.shuffle(imgfilelist)

    def train_reader():
        for idx, imgfile in enumerate(imgfilelist):
            data = image.load_image(
                os.path.join(data_dir, "train", imgfile.lower()))
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, train_reader)


def imagenet_test(data_dir):
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "val.txt")) as fn:
        while 1:
            l = fn.readline()
            if not l:
                break
            img, lbl = l[:-1].split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)

    def test_reader():
        for idx, imgfile in enumerate(imgfilelist):
            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
            image_path = ".".join([base_path, "jpeg"])
            data = image.load_image(image_path)
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, test_reader)


# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
        filename,
        batch_per_file,
        reader_creator,
        feeder,
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
    if feed_order is None:
        feed_order = feeder.feed_names
    f_name, f_ext = os.path.splitext(filename)
    assert f_ext == ".recordio"

    def write_shard(shard_lines, shard_idx):
        # write one shard named <f_name>-<shard_idx>.recordio
        shard_path = "%s-%05d%s" % (f_name, shard_idx, f_ext)
        with fluid.recordio_writer.create_recordio_writer(
                shard_path, compressor, max_num_records) as writer:
            for l in shard_lines:
                res = feeder.feed(l)
                for each in feed_order:
                    writer.append_tensor(res[each])
                writer.complete_append_tensor()
        print("written file: ", shard_path)
        return len(shard_lines)

    lines = []
    f_idx = 0
    counter = 0
    for batch in reader_creator():
        lines.append(batch)
        if len(lines) == batch_per_file:
            counter += write_shard(lines, f_idx)
            lines = []
            f_idx += 1
    if lines:
        # flush trailing batches that do not fill a whole shard
        counter += write_shard(lines, f_idx)
    return counter


def prepare_imagenet(inpath, outpath, batch_size):
    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=[
            fluid.layers.data(
                name="image", shape=[3, 224, 224]), fluid.layers.data(
                    name="label", shape=[1], dtype='int64')
        ],
        place=fluid.CPUPlace())
    outpath = os.path.join(outpath, "imagenet.recordio")
    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
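

# A minimal command-line entry point, sketched here as a usage example. It is
# not part of the original converter: the flag names (--dataset, --data_dir,
# --out_dir, --batch_size) and their defaults are assumptions chosen for
# illustration, and it simply dispatches to the prepare_* helpers above.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert paddle datasets to RecordIO files.")
    parser.add_argument(
        "--dataset",
        choices=["mnist", "cifar10", "flowers", "imagenet"],
        default="mnist")
    parser.add_argument(
        "--data_dir", default="", help="raw ImageNet directory (imagenet only)")
    parser.add_argument("--out_dir", default="./recordio")
    parser.add_argument("--batch_size", type=int, default=32)
    args = parser.parse_args()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    if args.dataset == "mnist":
        prepare_mnist(args.out_dir, args.batch_size)
    elif args.dataset == "cifar10":
        prepare_cifar10(args.out_dir, args.batch_size)
    elif args.dataset == "flowers":
        prepare_flowers(args.out_dir, args.batch_size)
    else:
        prepare_imagenet(args.data_dir, args.out_dir, args.batch_size)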