From fbd1d1cf26fc0c0d242685c29fc89213c8151798 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 25 Dec 2017 16:36:59 +0800
Subject: [PATCH] refine pipe_reader

---
 python/paddle/v2/reader/decorator.py | 121 +++++++++++----------------
 1 file changed, 50 insertions(+), 71 deletions(-)

diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index 27c82c95f..8262d992e 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"):
     return lines[:-1], lines[-1]
 
 
-def pipe_reader(left_cmd,
-                parser,
-                bufsize=8192,
-                file_type="plain",
-                cut_lines=True,
-                line_break="\n"):
+class PipeReader:
     """
-    pipe_reader read data by stream from a command, take it's 
-    stdout into a pipe buffer and redirect it to the parser to
-    parse, then yield data as your desired format.
+        PipeReader read data by stream from a command, take it's 
+        stdout into a pipe buffer and redirect it to the parser to
+        parse, then yield data as your desired format.
 
-    You can using standard linux command or call another program
-    to read data, from HDFS, Ceph, URL, AWS S3 etc:
+        You can using standard linux command or call another program
+        to read data, from HDFS, Ceph, URL, AWS S3 etc:
 
-    cmd = "hadoop fs -cat /path/to/some/file"
-    cmd = "cat sample_file.tar.gz"
-    cmd = "curl http://someurl"
-    cmd = "python print_s3_bucket.py"
+        .. code-block:: python
+           cmd = "hadoop fs -cat /path/to/some/file"
+           cmd = "cat sample_file.tar.gz"
+           cmd = "curl http://someurl"
+           cmd = "python print_s3_bucket.py"
 
-    A sample parser:
+        An example:
+
+        .. code-block:: python
     
-    def sample_parser(lines):
-        # parse each line as one sample data,
-        # return a list of samples as batches.
-        ret = []
-        for l in lines:
-            ret.append(l.split(" ")[1:5])
-        return ret
-
-    :param left_cmd: command to excute to get stdout from.
-    :type left_cmd: string
-    :param parser: parser function to parse lines of data.
-                   if cut_lines is True, parser will receive list
-                   of lines.
-                   if cut_lines is False, parser will receive a
-                   raw buffer each time.
-                   parser should return a list of parsed values.
-    :type parser: callable
-    :param bufsize: the buffer size used for the stdout pipe.
-    :type bufsize: int
-    :param file_type: can be plain/gzip, stream buffer data type.
-    :type file_type: string
-    :param cut_lines: whether to pass lines instead of raw buffer
-                      to the parser
-    :type cut_lines: bool
-    :param line_break: line break of the file, like \n or \r
-    :type line_break: string
-
-    :return: the reader generator.
-    :rtype: callable
+           def example_reader():
+               for f in myfiles:
+                   pr = PipeReader("cat %s"%f)
+                   for l in pr.get_line():
+                       sample = l.split(" ")
+                       yield sample
     """
-    if not isinstance(left_cmd, str):
-        raise TypeError("left_cmd must be a string")
-    if not callable(parser):
-        raise TypeError("parser must be a callable object")
-
-    # TODO(typhoonzero): add a thread to read stderr
-
-    # Always init a decompress object is better than
-    # create in the loop.
-    dec = zlib.decompressobj(
-        32 + zlib.MAX_WBITS)  # offset 32 to skip the header
 
-    def reader():
-        process = subprocess.Popen(
-            left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
         remained = ""
         while True:
-            buff = process.stdout.read(bufsize)
+            buff = self.process.stdout.read(self.bufsize)
             if buff:
-                if file_type == "gzip":
-                    decomp_buff = dec.decompress(buff)
-                elif file_type == "plain":
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
                     decomp_buff = buff
                 else:
-                    raise TypeError("file_type %s is not allowed" % file_type)
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
 
                 if cut_lines:
                     lines, remained = _buf2lines(''.join(
                         [remained, decomp_buff]), line_break)
-                    parsed_list = parser(lines)
-                    for ret in parsed_list:
-                        yield ret
+                    for line in lines:
+                        yield line
                 else:
-                    for ret in parser(decomp_buff):
-                        yield ret
+                    yield decomp_buff
             else:
                 break
-
-    return reader
-- 
GitLab