Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PGL
提交
9029259d
P
PGL
项目概览
PaddlePaddle
/
PGL
通知
76
Star
4
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
11
列表
看板
标记
里程碑
合并请求
1
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PGL
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
11
Issue
11
列表
看板
标记
里程碑
合并请求
1
合并请求
1
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9029259d
编写于
9月 22, 2020
作者:
W
Webbley
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add stream shuffle
上级
d7d96a89
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
77 additions
and
13 deletions
+77
-13
pgl/utils/data/dataloader.py
pgl/utils/data/dataloader.py
+77
-13
未找到文件。
pgl/utils/data/dataloader.py
浏览文件 @
9029259d
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
# limitations under the License.
# limitations under the License.
"""dataloader
"""dataloader
"""
"""
import
warnings
import
numpy
as
np
import
numpy
as
np
from
collections
import
namedtuple
from
collections
import
namedtuple
...
@@ -64,15 +64,15 @@ class Dataloader(object):
...
@@ -64,15 +64,15 @@ class Dataloader(object):
print(batch_data)
print(batch_data)
"""
"""
def
__init__
(
def
__init__
(
self
,
self
,
dataset
,
dataset
,
batch_size
=
1
,
batch_size
=
1
,
drop_last
=
False
,
drop_last
=
False
,
shuffle
=
False
,
shuffle
=
False
,
num_workers
=
1
,
num_workers
=
1
,
collate_fn
=
None
,
collate_fn
=
None
,
buf_size
=
1000
,
buf_size
=
1000
,
):
stream_shuffle_size
=
0
):
self
.
dataset
=
dataset
self
.
dataset
=
dataset
self
.
batch_size
=
batch_size
self
.
batch_size
=
batch_size
...
@@ -81,6 +81,22 @@ class Dataloader(object):
...
@@ -81,6 +81,22 @@ class Dataloader(object):
self
.
collate_fn
=
collate_fn
self
.
collate_fn
=
collate_fn
self
.
buf_size
=
buf_size
self
.
buf_size
=
buf_size
self
.
drop_last
=
drop_last
self
.
drop_last
=
drop_last
self
.
stream_shuffle_size
=
stream_shuffle_size
if
self
.
shuffle
and
isinstance
(
self
.
dataset
,
StreamDataset
):
warn_msg
=
"[shuffle] should not be True with StreamDataset. "
\
"It will be ignored. "
\
"You might want to set [stream_shuffle_size] with StreamDataset."
warnings
.
warn
(
warn_msg
)
if
self
.
stream_shuffle_size
>
0
and
self
.
batch_size
>=
stream_shuffle_size
:
raise
ValueError
(
"stream_shuffle_size must be larger than batch_size,"
\
"but got [stream_shuffle_size=%s] smaller than [batch_size=%s]"
\
%
(
self
.
stream_shuffle_size
,
self
.
batch_size
))
if
self
.
num_workers
<
1
:
raise
ValueError
(
"num_workers(default: 1) should be larger than 0, "
\
"but got [num_workers=%s] < 1."
%
self
.
num_workers
)
def
__len__
(
self
):
def
__len__
(
self
):
if
not
isinstance
(
self
.
dataset
,
StreamDataset
):
if
not
isinstance
(
self
.
dataset
,
StreamDataset
):
...
@@ -129,6 +145,8 @@ class _DataLoaderIter(object):
...
@@ -129,6 +145,8 @@ class _DataLoaderIter(object):
self
.
collate_fn
=
dataloader
.
collate_fn
self
.
collate_fn
=
dataloader
.
collate_fn
self
.
num_workers
=
dataloader
.
num_workers
self
.
num_workers
=
dataloader
.
num_workers
self
.
drop_last
=
dataloader
.
drop_last
self
.
drop_last
=
dataloader
.
drop_last
self
.
batch_size
=
dataloader
.
batch_size
self
.
stream_shuffle_size
=
dataloader
.
stream_shuffle_size
self
.
fid
=
fid
self
.
fid
=
fid
self
.
count
=
0
self
.
count
=
0
...
@@ -167,17 +185,63 @@ class _DataLoaderIter(object):
...
@@ -167,17 +185,63 @@ class _DataLoaderIter(object):
# make sure do not repeat in multiprocessing
# make sure do not repeat in multiprocessing
self
.
count
+=
1
self
.
count
+=
1
# if self.count % self.num_workers != self.fid:
# continue
if
self
.
collate_fn
is
not
None
:
if
self
.
collate_fn
is
not
None
:
yield
self
.
collate_fn
(
batch_data
)
yield
self
.
collate_fn
(
batch_data
)
else
:
else
:
yield
batch_data
yield
batch_data
def _stream_shuffle_data_generator(self):
    """Generate batches from a StreamDataset with buffered "stream" shuffling.

    Reads the (possibly unbounded) dataset stream in chunks of up to
    ``self.stream_shuffle_size`` examples, shuffles each chunk in place with
    ``np.random.shuffle``, and regroups the shuffled examples into batches
    of ``self.batch_size``. Only examples within the same buffered window
    are shuffled relative to each other — this approximates a full-dataset
    shuffle without materializing the stream.

    Yields:
        list: a batch of ``batch_size`` examples (the final batch may be
        shorter when ``self.drop_last`` is False), or the result of
        ``self.collate_fn(batch)`` when a collate function is set.
    """

    def _chunk_generator():
        # Pull up to `stream_shuffle_size` consecutive examples from the
        # stream per chunk; stop once the underlying iterator is exhausted.
        # (Replaces the original infinite index generator, which yielded
        # the same [0..stream_shuffle_size) list forever and never used
        # the indices — same behavior, less machinery.)
        stream = iter(self.dataset)
        while True:
            chunk = []
            for _ in range(self.stream_shuffle_size):
                try:
                    chunk.append(next(stream))
                except StopIteration:
                    break
            if not chunk:
                break
            yield chunk

    def _batch_data_generator():
        batch_data = []
        for chunk in _chunk_generator():
            # Shuffle within the buffered window only.
            np.random.shuffle(chunk)
            for example in chunk:
                batch_data.append(example)
                if len(batch_data) == self.batch_size:
                    yield batch_data
                    batch_data = []
        # Leftover examples form a final short batch unless drop_last is set.
        if not self.drop_last and batch_data:
            yield batch_data

    # Register this worker with the dataset so multi-worker iteration can
    # partition the stream. NOTE(review): WorkerInfo and _set_worker_info
    # are defined elsewhere in this file/project.
    self._worker_info = WorkerInfo(num_workers=self.num_workers, fid=self.fid)
    self.dataset._set_worker_info(self._worker_info)

    for batch_data in _batch_data_generator():
        if self.collate_fn is not None:
            yield self.collate_fn(batch_data)
        else:
            yield batch_data
def
__iter__
(
self
):
def
__iter__
(
self
):
if
isinstance
(
self
.
dataset
,
StreamDataset
):
if
isinstance
(
self
.
dataset
,
StreamDataset
):
data_generator
=
self
.
_streamdata_generator
if
self
.
stream_shuffle_size
>
0
:
data_generator
=
self
.
_stream_shuffle_data_generator
else
:
data_generator
=
self
.
_streamdata_generator
else
:
else
:
data_generator
=
self
.
_data_generator
data_generator
=
self
.
_data_generator
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录