diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 27c82c95f79e0a3e3129627bfa33d85e0d3cd862..44a6e344630bb35d28ee29078bf8727053a24bef 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,7 +14,7 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' ] from threading import Thread @@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"): return lines[:-1], lines[-1] -def pipe_reader(left_cmd, - parser, - bufsize=8192, - file_type="plain", - cut_lines=True, - line_break="\n"): +class PipeReader: """ - pipe_reader read data by stream from a command, take it's - stdout into a pipe buffer and redirect it to the parser to - parse, then yield data as your desired format. + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. - You can using standard linux command or call another program - to read data, from HDFS, Ceph, URL, AWS S3 etc: + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: - cmd = "hadoop fs -cat /path/to/some/file" - cmd = "cat sample_file.tar.gz" - cmd = "curl http://someurl" - cmd = "python print_s3_bucket.py" + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" - A sample parser: + An example: + + .. code-block:: python - def sample_parser(lines): - # parse each line as one sample data, - # return a list of samples as batches. - ret = [] - for l in lines: - ret.append(l.split(" ")[1:5]) - return ret - - :param left_cmd: command to excute to get stdout from. - :type left_cmd: string - :param parser: parser function to parse lines of data. - if cut_lines is True, parser will receive list - of lines. - if cut_lines is False, parser will receive a - raw buffer each time. - parser should return a list of parsed values. - :type parser: callable - :param bufsize: the buffer size used for the stdout pipe. - :type bufsize: int - :param file_type: can be plain/gzip, stream buffer data type. - :type file_type: string - :param cut_lines: whether to pass lines instead of raw buffer - to the parser - :type cut_lines: bool - :param line_break: line break of the file, like \n or \r - :type line_break: string - - :return: the reader generator. - :rtype: callable + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample """ - if not isinstance(left_cmd, str): - raise TypeError("left_cmd must be a string") - if not callable(parser): - raise TypeError("parser must be a callable object") - - # TODO(typhoonzero): add a thread to read stderr - - # Always init a decompress object is better than - # create in the loop. - dec = zlib.decompressobj( - 32 + zlib.MAX_WBITS) # offset 32 to skip the header - def reader(): - process = subprocess.Popen( - left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ remained = "" while True: - buff = process.stdout.read(bufsize) + buff = self.process.stdout.read(self.bufsize) if buff: - if file_type == "gzip": - decomp_buff = dec.decompress(buff) - elif file_type == "plain": + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": decomp_buff = buff else: - raise TypeError("file_type %s is not allowed" % file_type) + raise TypeError("file_type %s is not allowed" % + self.file_type) if cut_lines: lines, remained = _buf2lines(''.join( [remained, decomp_buff]), line_break) - parsed_list = parser(lines) - for ret in parsed_list: - yield ret + for line in lines: + yield line else: - for ret in parser(decomp_buff): - yield ret + yield decomp_buff else: break - - return reader diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 06e14796daf27812b9aeb1e4b024f294c7609991..4ba71969dffe7447b6c5b70aeb752a4e5469fb36 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -147,8 +147,11 @@ class TestXmap(unittest.TestCase): class TestPipeReader(unittest.TestCase): def test_pipe_reader(self): - def simple_parser(lines): - return lines + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l import tempfile @@ -159,17 +162,12 @@ class TestPipeReader(unittest.TestCase): for r in records: f.write('%s\n' % r) - cmd = "cat %s" % temp.name - reader = paddle.v2.reader.pipe_reader( - cmd, simple_parser, bufsize=128) - for i in xrange(4): - result = [] - for r in reader(): - result.append(r) - - for idx, e in enumerate(records): - print e, result[idx] - self.assertEqual(e, result[idx]) + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) finally: # delete the temporary file temp.close()