Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
b4302bbb
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b4302bbb
编写于
12月 26, 2017
作者:
武
武毅
提交者:
GitHub
12月 26, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6990 from typhoonzero/refine_pipe_reader
refine pipe_reader
上级
80dafdf5
9b67688b
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
62 additions
and
85 deletions
+62
-85
python/paddle/v2/reader/decorator.py
python/paddle/v2/reader/decorator.py
+51
-72
python/paddle/v2/reader/tests/decorator_test.py
python/paddle/v2/reader/tests/decorator_test.py
+11
-13
未找到文件。
python/paddle/v2/reader/decorator.py
浏览文件 @
b4302bbb
...
...
@@ -14,7 +14,7 @@
__all__
=
[
'map_readers'
,
'buffered'
,
'compose'
,
'chain'
,
'shuffle'
,
'ComposeNotAligned'
,
'firstn'
,
'xmap_readers'
,
'
pipe_r
eader'
'ComposeNotAligned'
,
'firstn'
,
'xmap_readers'
,
'
PipeR
eader'
]
from
threading
import
Thread
...
...
@@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"):
return
lines
[:
-
1
],
lines
[
-
1
]
def
pipe_reader
(
left_cmd
,
parser
,
bufsize
=
8192
,
file_type
=
"plain"
,
cut_lines
=
True
,
line_break
=
"
\n
"
):
class
PipeReader
:
"""
pipe_r
eader reads data by stream from a command, takes its
stdout into a pipe buffer and redirects it to the parser to
parse, then yields data in your desired format.
PipeR
eader reads data by stream from a command, takes its
stdout into a pipe buffer and redirects it to the parser to
parse, then yields data in your desired format.
You can use a standard Linux command or call another program
to read data from HDFS, Ceph, a URL, AWS S3, etc.:
You can use a standard Linux command or call another program
to read data from HDFS, Ceph, a URL, AWS S3, etc.:
cmd = "hadoop fs -cat /path/to/some/file"
cmd = "cat sample_file.tar.gz"
cmd = "curl http://someurl"
cmd = "python print_s3_bucket.py"
.. code-block:: python
cmd = "hadoop fs -cat /path/to/some/file"
cmd = "cat sample_file.tar.gz"
cmd = "curl http://someurl"
cmd = "python print_s3_bucket.py"
A sample parser:
An example:
.. code-block:: python
def sample_parser(lines):
# parse each line as one sample data,
# return a list of samples as batches.
ret = []
for l in lines:
ret.append(l.split(" ")[1:5])
return ret
:param left_cmd: command to execute to get stdout from.
:type left_cmd: string
:param parser: parser function to parse lines of data.
if cut_lines is True, parser will receive list
of lines.
if cut_lines is False, parser will receive a
raw buffer each time.
parser should return a list of parsed values.
:type parser: callable
:param bufsize: the buffer size used for the stdout pipe.
:type bufsize: int
:param file_type: can be plain/gzip, stream buffer data type.
:type file_type: string
:param cut_lines: whether to pass lines instead of raw buffer
to the parser
:type cut_lines: bool
:param line_break: line break of the file, like
\n
or
\r
:type line_break: string
:return: the reader generator.
:rtype: callable
def example_reader():
for f in myfiles:
pr = PipeReader("cat %s"%f)
for l in pr.get_line():
sample = l.split(" ")
yield sample
"""
if
not
isinstance
(
left_cmd
,
str
):
raise
TypeError
(
"left_cmd must be a string"
)
if
not
callable
(
parser
):
raise
TypeError
(
"parser must be a callable object"
)
# TODO(typhoonzero): add a thread to read stderr
# Always init a decompress object is better than
# create in the loop.
dec
=
zlib
.
decompressobj
(
32
+
zlib
.
MAX_WBITS
)
# offset 32 to skip the header
def
reader
():
process
=
subprocess
.
Popen
(
left_cmd
.
split
(
" "
),
bufsize
=
bufsize
,
stdout
=
subprocess
.
PIPE
)
def
__init__
(
self
,
command
,
bufsize
=
8192
,
file_type
=
"plain"
):
if
not
isinstance
(
command
,
str
):
raise
TypeError
(
"left_cmd must be a string"
)
if
file_type
==
"gzip"
:
self
.
dec
=
zlib
.
decompressobj
(
32
+
zlib
.
MAX_WBITS
)
# offset 32 to skip the header
self
.
file_type
=
file_type
self
.
bufsize
=
bufsize
self
.
process
=
subprocess
.
Popen
(
command
.
split
(
" "
),
bufsize
=
bufsize
,
stdout
=
subprocess
.
PIPE
)
def
get_line
(
self
,
cut_lines
=
True
,
line_break
=
"
\n
"
):
"""
:param cut_lines: cut buffer to lines
:type cut_lines: bool
:param line_break: line break of the file, like
\n
or
\r
:type line_break: string
:return: one line or a buffer of bytes
:rtype: string
"""
remained
=
""
while
True
:
buff
=
process
.
stdout
.
read
(
bufsize
)
buff
=
self
.
process
.
stdout
.
read
(
self
.
bufsize
)
if
buff
:
if
file_type
==
"gzip"
:
decomp_buff
=
dec
.
decompress
(
buff
)
elif
file_type
==
"plain"
:
if
self
.
file_type
==
"gzip"
:
decomp_buff
=
self
.
dec
.
decompress
(
buff
)
elif
self
.
file_type
==
"plain"
:
decomp_buff
=
buff
else
:
raise
TypeError
(
"file_type %s is not allowed"
%
file_type
)
raise
TypeError
(
"file_type %s is not allowed"
%
self
.
file_type
)
if
cut_lines
:
lines
,
remained
=
_buf2lines
(
''
.
join
(
[
remained
,
decomp_buff
]),
line_break
)
parsed_list
=
parser
(
lines
)
for
ret
in
parsed_list
:
yield
ret
for
line
in
lines
:
yield
line
else
:
for
ret
in
parser
(
decomp_buff
):
yield
ret
yield
decomp_buff
else
:
break
return
reader
python/paddle/v2/reader/tests/decorator_test.py
浏览文件 @
b4302bbb
...
...
@@ -147,8 +147,11 @@ class TestXmap(unittest.TestCase):
class
TestPipeReader
(
unittest
.
TestCase
):
def
test_pipe_reader
(
self
):
def
simple_parser
(
lines
):
return
lines
def
example_reader
(
myfiles
):
for
f
in
myfiles
:
pr
=
paddle
.
v2
.
reader
.
PipeReader
(
"cat %s"
%
f
,
bufsize
=
128
)
for
l
in
pr
.
get_line
():
yield
l
import
tempfile
...
...
@@ -159,17 +162,12 @@ class TestPipeReader(unittest.TestCase):
for
r
in
records
:
f
.
write
(
'%s
\n
'
%
r
)
cmd
=
"cat %s"
%
temp
.
name
reader
=
paddle
.
v2
.
reader
.
pipe_reader
(
cmd
,
simple_parser
,
bufsize
=
128
)
for
i
in
xrange
(
4
):
result
=
[]
for
r
in
reader
():
result
.
append
(
r
)
for
idx
,
e
in
enumerate
(
records
):
print
e
,
result
[
idx
]
self
.
assertEqual
(
e
,
result
[
idx
])
result
=
[]
for
r
in
example_reader
([
temp
.
name
]):
result
.
append
(
r
)
for
idx
,
e
in
enumerate
(
records
):
self
.
assertEqual
(
e
,
result
[
idx
])
finally
:
# delete the temporary file
temp
.
close
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录