Merge pull request #6990 from typhoonzero/refine_pipe_reader

refine pipe_reader

Merge pull request #6990 from typhoonzero/refine_pipe_reader
refine pipe_reader
b4302bbb · 武毅 · GitHub · 80dafdf5 · 9b67688b · b4302bbb
Showing with 62 additions and 85 deletions

python/paddle/v2/reader/decorator.py python/paddle/v2/reader/decorator.py +51 -72

python/paddle/v2/reader/tests/decorator_test.py python/paddle/v2/reader/tests/decorator_test.py +11 -13

未找到文件。
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,7 +14,7 @@
 __all__ = [
    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
 ]
 from threading import Thread
@@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"):
    return lines[:-1], lines[-1]
-def pipe_reader(left_cmd,
+class PipeReader:
-                parser,
-                bufsize=8192,
-                file_type="plain",
-                cut_lines=True,
-                line_break="\n"):
    """
-    pipe_reader read data by stream from a command, take it's 
+        PipeReader read data by stream from a command, take it's 
        stdout into a pipe buffer and redirect it to the parser to
        parse, then yield data as your desired format.
        You can using standard linux command or call another program
        to read data, from HDFS, Ceph, URL, AWS S3 etc:
+        .. code-block:: python
           cmd = "hadoop fs -cat /path/to/some/file"
           cmd = "cat sample_file.tar.gz"
           cmd = "curl http://someurl"
           cmd = "python print_s3_bucket.py"
-    A sample parser:
+        An example:
-    def sample_parser(lines):
-        # parse each line as one sample data,
-        # return a list of samples as batches.
-        ret = []
-        for l in lines:
-            ret.append(l.split(" ")[1:5])
-        return ret
-    :param left_cmd: command to excute to get stdout from.
-    :type left_cmd: string
-    :param parser: parser function to parse lines of data.
-                   if cut_lines is True, parser will receive list
-                   of lines.
-                   if cut_lines is False, parser will receive a
-                   raw buffer each time.
-                   parser should return a list of parsed values.
-    :type parser: callable
-    :param bufsize: the buffer size used for the stdout pipe.
-    :type bufsize: int
-    :param file_type: can be plain/gzip, stream buffer data type.
-    :type file_type: string
-    :param cut_lines: whether to pass lines instead of raw buffer
-                      to the parser
-    :type cut_lines: bool
-    :param line_break: line break of the file, like \n or \r
-    :type line_break: string
-    :return: the reader generator.
+        .. code-block:: python
-    :rtype: callable
-    """
-    if not isinstance(left_cmd, str):
-        raise TypeError("left_cmd must be a string")
-    if not callable(parser):
-        raise TypeError("parser must be a callable object")
-    # TODO(typhoonzero): add a thread to read stderr
+           def example_reader():
+               for f in myfiles:
+                   pr = PipeReader("cat %s"%f)
+                   for l in pr.get_line():
+                       sample = l.split(" ")
+                       yield sample
+    """
-    # Always init a decompress object is better than
+    def __init__(self, command, bufsize=8192, file_type="plain"):
-    # create in the loop.
+        if not isinstance(command, str):
-    dec = zlib.decompressobj(
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
-    def reader():
+    def get_line(self, cut_lines=True, line_break="\n"):
-        process = subprocess.Popen(
+        """
-            left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
        remained = ""
        while True:
-            buff = process.stdout.read(bufsize)
+            buff = self.process.stdout.read(self.bufsize)
            if buff:
-                if file_type == "gzip":
+                if self.file_type == "gzip":
-                    decomp_buff = dec.decompress(buff)
+                    decomp_buff = self.dec.decompress(buff)
-                elif file_type == "plain":
+                elif self.file_type == "plain":
                    decomp_buff = buff
                else:
-                    raise TypeError("file_type %s is not allowed" % file_type)
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
                if cut_lines:
                    lines, remained = _buf2lines(''.join(
                        [remained, decomp_buff]), line_break)
-                    parsed_list = parser(lines)
+                    for line in lines:
-                    for ret in parsed_list:
+                        yield line
-                        yield ret
                else:
-                    for ret in parser(decomp_buff):
+                    yield decomp_buff
-                        yield ret
            else:
                break
-    return reader
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -147,8 +147,11 @@ class TestXmap(unittest.TestCase):
 class TestPipeReader(unittest.TestCase):
    def test_pipe_reader(self):
-        def simple_parser(lines):
+        def example_reader(myfiles):
-            return lines
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
        import tempfile
@@ -159,16 +162,11 @@ class TestPipeReader(unittest.TestCase):
                for r in records:
                    f.write('%s\n' % r)
-            cmd = "cat %s" % temp.name
-            reader = paddle.v2.reader.pipe_reader(
-                cmd, simple_parser, bufsize=128)
-            for i in xrange(4):
            result = []
-                for r in reader():
+            for r in example_reader([temp.name]):
                result.append(r)
            for idx, e in enumerate(records):
-                    print e, result[idx]
                self.assertEqual(e, result[idx])
        finally:
            # delete the temporary file