creator.py 3.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
"""
Creator package contains some simple reader creators, which can
be used in user programs.
"""
18

19
# Public names exported by a star-import of this module. Note that the
# recordio creator defined in this file is not part of this list.
__all__ = [
    'np_array',
    'text_file',
    "cloud_reader",
]
20 21 22 23


def np_array(x):
    """
    Creates a reader that yields elements of x, if it is a
    numpy vector. Or rows of x, if it is a numpy matrix.
    Or any sub-hyperplane indexed by the highest dimension.
    A zero-dimensional array is yielded once, unchanged.

    :param x: the numpy array to create reader from.
    :returns: data reader created from x.
    """

    def reader():
        if x.ndim < 1:
            # A 0-d array cannot be iterated, so it is the only item this
            # reader produces; stop here instead of falling through to the
            # loop below, which would raise TypeError.
            yield x
            return

        for e in x:
            yield e

    return reader


def text_file(path):
    """
    Creates a data reader that outputs text line by line from given text file.
    Trailing new line ('\\n') of each line will be removed.

    :param path: path of the text file.
    :returns: data reader of text file
    """

    def reader():
        # The with-block closes the file even when the consumer stops
        # iterating early or an error is raised while reading.
        with open(path, "r") as f:
            for l in f:
                yield l.rstrip('\n')

    return reader
58 59


H
Helin Wang 已提交
60
def recordio(paths, buf_size=100):
    """
    Creates a data reader from given RecordIO file paths separated by ",",
    glob pattern is supported.

    :param paths: path of recordio files. Either a single string, which is
        used as is, or an iterable of strings, which is joined with ",".
    :param buf_size: buffer size passed to the ``buffered`` reader decorator
        that wraps the returned reader.
    :returns: data reader of recordio files.
    """

    import recordio as rec
    import paddle.v2.reader.decorator as dec
    import cPickle as pickle

    def reader():
        if isinstance(paths, basestring):
            path = paths
        else:
            path = ",".join(paths)
        f = rec.reader(path)
        try:
            while True:
                r = f.read()
                # read() returning None marks the end of the records.
                if r is None:
                    break
                # NOTE: unpickling can execute arbitrary code; only read
                # RecordIO files that come from a trusted source.
                yield pickle.loads(r)
        finally:
            # Release the RecordIO reader even if the consumer stops early
            # or a record fails to unpickle.
            f.close()

    return dec.buffered(reader, buf_size)
86

G
gongweibao 已提交
87

88 89 90 91
# Counter passed to the master client's paddle_start_get_records() by the
# readers that cloud_reader creates. It lives at module level, so every such
# reader shares it and increments it each time it starts iterating.
pass_num = 0


def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
    """
    Create a data reader that yields records one by one from
    the given paths.

    :param paths: path of recordio files; handed to the master client's
        ``set_dataset``.
    :param etcd_endpoints: the endpoints for etcd cluster
    :param timeout_sec: forwarded to the master client constructor
        (presumably a timeout in seconds -- confirm against the client).
    :param buf_size: forwarded to the master client constructor.
    :returns: data reader of recordio files.

    ..  code-block:: python

        from paddle.v2.reader.creator import cloud_reader
        etcd_endpoints = "http://127.0.0.1:2379"
        trainer.train(
            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
        )
    """
    import cPickle as pickle
    import paddle.v2.master as master
    c = master.client(etcd_endpoints, timeout_sec, buf_size)
    c.set_dataset(paths)

    def reader():
        # pass_num is module-level state shared by all cloud readers; each
        # new iteration requests the records of the next pass.
        global pass_num
        c.paddle_start_get_records(pass_num)
        pass_num += 1

        while True:
            r, e = c.next_record()
            if not r:
                # Error code -2 stops the reader silently (presumably it
                # marks the end of the pass -- confirm against the master
                # client); any other code is reported before stopping.
                if e != -2:
                    # Single-argument call form: same output as the former
                    # print statement, and it also parses on Python 3.
                    print("get record error:  %s" % (e, ))
                break
            # NOTE: unpickling can execute arbitrary code; the records must
            # come from a trusted master/dataset.
            yield pickle.loads(r)

    return reader