Merge pull request #6990 from typhoonzero/refine_pipe_reader

refine pipe_reader

Merge pull request #6990 from typhoonzero/refine_pipe_reader
refine pipe_reader
b4302bbb · 武毅 · GitHub · 80dafdf5 · 9b67688b · b4302bbb
隐藏空白更改
内联并排

Showing 2 changed files with 62 additions and 85 deletions

python/paddle/v2/reader/decorator.py python/paddle/v2/reader/decorator.py +51 -72

python/paddle/v2/reader/tests/decorator_test.py python/paddle/v2/reader/tests/decorator_test.py +11 -13

未找到文件。
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,7 +14,7 @@
 __all__ = [
    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
 ]
 from threading import Thread
@@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"):
    return lines[:-1], lines[-1]
-def pipe_reader(left_cmd,
+class PipeReader:
-                parser,
-                bufsize=8192,
-                file_type="plain",
-                cut_lines=True,
-                line_break="\n"):
    """
-    pipe_reader read data by stream from a command, take it's 
+        PipeReader read data by stream from a command, take its
-    stdout into a pipe buffer and redirect it to the parser to
+        stdout into a pipe buffer and redirect it to the parser to
-    parse, then yield data as your desired format.
+        parse, then yield data as your desired format.
-    You can using standard linux command or call another program
+        You can use standard Linux commands or call another program
-    to read data, from HDFS, Ceph, URL, AWS S3 etc:
+        to read data, from HDFS, Ceph, URL, AWS S3 etc:
-    cmd = "hadoop fs -cat /path/to/some/file"
+        .. code-block:: python
-    cmd = "cat sample_file.tar.gz"
+           cmd = "hadoop fs -cat /path/to/some/file"
-    cmd = "curl http://someurl"
+           cmd = "cat sample_file.tar.gz"
-    cmd = "python print_s3_bucket.py"
+           cmd = "curl http://someurl"
+           cmd = "python print_s3_bucket.py"
-    A sample parser:
+        An example:
+        .. code-block:: python
-    def sample_parser(lines):
+           def example_reader():
-        # parse each line as one sample data,
+               for f in myfiles:
-        # return a list of samples as batches.
+                   pr = PipeReader("cat %s"%f)
-        ret = []
+                   for l in pr.get_line():
-        for l in lines:
+                       sample = l.split(" ")
-            ret.append(l.split(" ")[1:5])
+                       yield sample
-        return ret
-    :param left_cmd: command to excute to get stdout from.
-    :type left_cmd: string
-    :param parser: parser function to parse lines of data.
-                   if cut_lines is True, parser will receive list
-                   of lines.
-                   if cut_lines is False, parser will receive a
-                   raw buffer each time.
-                   parser should return a list of parsed values.
-    :type parser: callable
-    :param bufsize: the buffer size used for the stdout pipe.
-    :type bufsize: int
-    :param file_type: can be plain/gzip, stream buffer data type.
-    :type file_type: string
-    :param cut_lines: whether to pass lines instead of raw buffer
-                      to the parser
-    :type cut_lines: bool
-    :param line_break: line break of the file, like \n or \r
-    :type line_break: string
-    :return: the reader generator.
-    :rtype: callable
    """
-    if not isinstance(left_cmd, str):
-        raise TypeError("left_cmd must be a string")
-    if not callable(parser):
-        raise TypeError("parser must be a callable object")
-    # TODO(typhoonzero): add a thread to read stderr
-    # Always init a decompress object is better than
-    # create in the loop.
-    dec = zlib.decompressobj(
-        32 + zlib.MAX_WBITS)  # offset 32 to skip the header
-    def reader():
+    def __init__(self, command, bufsize=8192, file_type="plain"):
-        process = subprocess.Popen(
+        if not isinstance(command, str):
-            left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
        remained = ""
        while True:
-            buff = process.stdout.read(bufsize)
+            buff = self.process.stdout.read(self.bufsize)
            if buff:
-                if file_type == "gzip":
+                if self.file_type == "gzip":
-                    decomp_buff = dec.decompress(buff)
+                    decomp_buff = self.dec.decompress(buff)
-                elif file_type == "plain":
+                elif self.file_type == "plain":
                    decomp_buff = buff
                else:
-                    raise TypeError("file_type %s is not allowed" % file_type)
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
                if cut_lines:
                    lines, remained = _buf2lines(''.join(
                        [remained, decomp_buff]), line_break)
-                    parsed_list = parser(lines)
+                    for line in lines:
-                    for ret in parsed_list:
+                        yield line
-                        yield ret
                else:
-                    for ret in parser(decomp_buff):
+                    yield decomp_buff
-                        yield ret
            else:
                break
-    return reader
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -147,8 +147,11 @@ class TestXmap(unittest.TestCase):
 class TestPipeReader(unittest.TestCase):
    def test_pipe_reader(self):
-        def simple_parser(lines):
+        def example_reader(myfiles):
-            return lines
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
        import tempfile
@@ -159,17 +162,12 @@ class TestPipeReader(unittest.TestCase):
                for r in records:
                    f.write('%s\n' % r)
-            cmd = "cat %s" % temp.name
+            result = []
-            reader = paddle.v2.reader.pipe_reader(
+            for r in example_reader([temp.name]):
-                cmd, simple_parser, bufsize=128)
+                result.append(r)
-            for i in xrange(4):
-                result = []
+            for idx, e in enumerate(records):
-                for r in reader():
+                self.assertEqual(e, result[idx])
-                    result.append(r)
-                for idx, e in enumerate(records):
-                    print e, result[idx]
-                    self.assertEqual(e, result[idx])
        finally:
            # delete the temporary file
            temp.close()