From fbd1d1cf26fc0c0d242685c29fc89213c8151798 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 25 Dec 2017 16:36:59 +0800 Subject: [PATCH] refine pipe_reader --- python/paddle/v2/reader/decorator.py | 121 +++++++++++---------------- 1 file changed, 50 insertions(+), 71 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 27c82c95f..8262d992e 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"): return lines[:-1], lines[-1] -def pipe_reader(left_cmd, - parser, - bufsize=8192, - file_type="plain", - cut_lines=True, - line_break="\n"): +class PipeReader: """ - pipe_reader read data by stream from a command, take it's - stdout into a pipe buffer and redirect it to the parser to - parse, then yield data as your desired format. + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. - You can using standard linux command or call another program - to read data, from HDFS, Ceph, URL, AWS S3 etc: + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: - cmd = "hadoop fs -cat /path/to/some/file" - cmd = "cat sample_file.tar.gz" - cmd = "curl http://someurl" - cmd = "python print_s3_bucket.py" + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" - A sample parser: + An example: + + .. code-block:: python - def sample_parser(lines): - # parse each line as one sample data, - # return a list of samples as batches. - ret = [] - for l in lines: - ret.append(l.split(" ")[1:5]) - return ret - - :param left_cmd: command to excute to get stdout from. - :type left_cmd: string - :param parser: parser function to parse lines of data. - if cut_lines is True, parser will receive list - of lines. - if cut_lines is False, parser will receive a - raw buffer each time. - parser should return a list of parsed values. - :type parser: callable - :param bufsize: the buffer size used for the stdout pipe. - :type bufsize: int - :param file_type: can be plain/gzip, stream buffer data type. - :type file_type: string - :param cut_lines: whether to pass lines instead of raw buffer - to the parser - :type cut_lines: bool - :param line_break: line break of the file, like \n or \r - :type line_break: string - - :return: the reader generator. - :rtype: callable + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample """ - if not isinstance(left_cmd, str): - raise TypeError("left_cmd must be a string") - if not callable(parser): - raise TypeError("parser must be a callable object") - - # TODO(typhoonzero): add a thread to read stderr - - # Always init a decompress object is better than - # create in the loop. - dec = zlib.decompressobj( - 32 + zlib.MAX_WBITS) # offset 32 to skip the header - def reader(): - process = subprocess.Popen( - left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ remained = "" while True: - buff = process.stdout.read(bufsize) + buff = self.process.stdout.read(self.bufsize) if buff: - if file_type == "gzip": - decomp_buff = dec.decompress(buff) - elif file_type == "plain": + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": decomp_buff = buff else: - raise TypeError("file_type %s is not allowed" % file_type) + raise TypeError("file_type %s is not allowed" % + self.file_type) if cut_lines: lines, remained = _buf2lines(''.join( [remained, decomp_buff]), line_break) - parsed_list = parser(lines) - for ret in parsed_list: - yield ret + for line in lines: + yield line else: - for ret in parser(decomp_buff): - yield ret + yield decomp_buff else: break - - return reader -- GitLab