Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
c45481d7
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
c45481d7
编写于
8月 19, 2020
作者:
K
Kaipeng Deng
提交者:
GitHub
8月 19, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add Sampler/SequenceSampler/RandomSampler (#26375)
* add Sampler/SequenceSampler/RandomSampler. test=develop
上级
56890dc7
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
349 addition
and
52 deletion
+349
-52
python/paddle/fluid/dataloader/__init__.py
python/paddle/fluid/dataloader/__init__.py
+5
-1
python/paddle/fluid/dataloader/batch_sampler.py
python/paddle/fluid/dataloader/batch_sampler.py
+35
-44
python/paddle/fluid/dataloader/sampler.py
python/paddle/fluid/dataloader/sampler.py
+232
-0
python/paddle/fluid/tests/unittests/test_batch_sampler.py
python/paddle/fluid/tests/unittests/test_batch_sampler.py
+72
-6
python/paddle/io/__init__.py
python/paddle/io/__init__.py
+5
-1
未找到文件。
python/paddle/fluid/dataloader/__init__.py
浏览文件 @
c45481d7
...
...
@@ -23,6 +23,10 @@ from .batch_sampler import *
from
.
import
dataloader_iter
from
.dataloader_iter
import
*
from
.
import
sampler
from
.sampler
import
*
__all__
=
dataset
.
__all__
\
+
batch_sampler
.
__all__
\
+
dataloader_iter
.
__all__
+
dataloader_iter
.
__all__
\
+
sampler
.
__all__
python/paddle/fluid/dataloader/batch_sampler.py
浏览文件 @
c45481d7
...
...
@@ -16,12 +16,13 @@ from __future__ import print_function
from
__future__
import
division
import
numpy
as
np
from
.sampler
import
Sampler
,
SequenceSampler
from
.dataset
import
Dataset
,
IterableDataset
__all__
=
[
"BatchSampler"
]
class
BatchSampler
(
object
):
class
BatchSampler
(
Sampler
):
"""
A base implement of batch sampler used by `paddle.io.DataLoader`
which yield mini-batch indices(a list/tuple with length as
...
...
@@ -41,10 +42,11 @@ class BatchSampler(object):
implement or other python object which implemented
:code:`__len__` for BatchSampler to get indices as the
range of :attr:`dataset` length. Default None.
indices (list|tuple): a substitution parameter for
:attr:`dataset` either :attr:`dataset` or
:attr:`indices` should be set, give the whole
indices to sampler from directly. Default None.
sampler (Sampler): this could be a :code:`paddle.io.Dataset`
instance which implemented :code:`__iter__` to yield
sample indices. :attr:`sampler` and :attr:`dataset`
can not be set in the same time. If :attr:`sampler`
is set, :attr:`shuffle` should not be set. Default None.
shuffle(bool): whether to shuffle indices order before genrating
batch indices. Default False.
batch_size(int): sample indice number in a mini-batch indices.
...
...
@@ -58,16 +60,7 @@ class BatchSampler(object):
.. code-block:: python
from paddle.io import BatchSampler, Dataset
# init with indices
bs = BatchSampler(indices=list(range(100)),
shuffle=True,
batch_size=8,
drop_last=True)
for batch_indices in bs:
print(batch_indices)
from paddle.io import RandomSampler, BatchSampler, Dataset
# init with dataset
class RandomDataset(Dataset):
...
...
@@ -90,34 +83,42 @@ class BatchSampler(object):
for batch_indices in bs:
print(batch_indices)
# init with sampler
sampler = RandomSampler(RandomDataset(100))
bs = BatchSampler(sampler=sampler,
shuffle=True,
batch_size=8,
drop_last=True)
for batch_indices in bs:
print(batch_indices)
see `paddle.io.DataLoader`
"""
def
__init__
(
self
,
dataset
=
None
,
indices
=
None
,
sampler
=
None
,
shuffle
=
False
,
batch_size
=
1
,
drop_last
=
False
):
if
dataset
is
None
:
assert
indices
is
not
None
,
\
"either dataset or
indices
should be set"
assert
isinstance
(
indices
,
list
)
or
isinstance
(
indices
,
tuple
),
\
"
indices should be a list or tuple, but got {}"
.
format
(
type
(
indices
))
self
.
indices
=
indices
self
.
sampler
_iter
=
None
assert
sampler
is
not
None
,
\
"either dataset or
sampler
should be set"
assert
isinstance
(
sampler
,
Sampler
),
\
"
sampler should be a paddle.io.Sampler, but got {}"
.
format
(
type
(
sampler
))
assert
not
shuffle
,
"shuffle should be False when sampler is set"
self
.
sampler
=
sampler
else
:
if
isinstance
(
dataset
,
IterableDataset
):
self
.
sampler_iter
=
iter
(
_InfiniteIterableSampler
(
dataset
,
batch_size
))
else
:
self
.
sampler_iter
=
None
assert
isinstance
(
dataset
,
Dataset
),
\
"dataset should be an instance of paddle.io.Dataset"
assert
indices
is
None
,
\
"should not set both dataset and indices"
self
.
indices
=
list
(
range
(
len
(
dataset
)))
assert
isinstance
(
dataset
,
Dataset
),
\
"dataset should be a paddle.io.Dataset"
assert
not
isinstance
(
dataset
,
IterableDataset
),
\
"dataset should not be a paddle.io.IterableDataset"
assert
sampler
is
None
,
\
"should not set both dataset and sampler"
self
.
sampler
=
SequenceSampler
(
dataset
)
assert
isinstance
(
batch_size
,
int
)
and
batch_size
>
0
,
\
"batch_size should be a positive integer, but got {}"
.
format
(
batch_size
)
...
...
@@ -130,15 +131,8 @@ class BatchSampler(object):
self
.
drop_last
=
drop_last
def
__iter__
(
self
):
if
self
.
sampler_iter
:
yield
next
(
self
.
sampler_iter
)
if
self
.
shuffle
:
np
.
random
.
shuffle
(
self
.
indices
)
_iter
=
iter
(
self
.
indices
)
batch_indices
=
[]
for
idx
in
_it
er
:
for
idx
in
self
.
sampl
er
:
batch_indices
.
append
(
idx
)
if
len
(
batch_indices
)
==
self
.
batch_size
:
yield
batch_indices
...
...
@@ -147,10 +141,7 @@ class BatchSampler(object):
yield
batch_indices
def
__len__
(
self
):
if
self
.
sampler_iter
:
raise
RuntimeError
(
"'{}' should not be called for IterableDataset"
.
format
(
'__len__'
))
num_samples
=
len
(
self
.
indices
)
num_samples
=
len
(
self
.
sampler
)
num_samples
+=
int
(
not
self
.
drop_last
)
*
(
self
.
batch_size
-
1
)
return
num_samples
//
self
.
batch_size
...
...
python/paddle/fluid/dataloader/sampler.py
0 → 100644
浏览文件 @
c45481d7
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
division
import
numpy
as
np
__all__
=
[
"Sampler"
,
"SequenceSampler"
,
"RandomSampler"
]
class Sampler(object):
    """
    An abstract class to encapsulate methods and behaviors of samplers.

    All samplers used by :code:`paddle.io.BatchSampler` should be a subclass
    of :code:`paddle.io.Sampler`. Sampler subclasses should implement the
    following methods:

    :code:`__iter__`: return an iterator over the indices of the dataset
    elements.

    :code:`__len__`: the number of samples in :attr:`data_source`.

    Args:
        data_source(Dataset, optional): this could be an instance of
            :code:`paddle.io.Dataset` or another Python object which
            implements :code:`__len__`, so that a sampler can use the range
            of its length as indices. Default None.

    Returns:
        Sampler: an iterable object for iterating over sample indices

    Examples:

        .. code-block:: python

            import numpy as np
            from paddle.io import Dataset, Sampler

            class RandomDataset(Dataset):
                def __init__(self, num_samples):
                    self.num_samples = num_samples

                def __getitem__(self, idx):
                    image = np.random.random([784]).astype('float32')
                    label = np.random.randint(0, 9, (1, )).astype('int64')
                    return image, label

                def __len__(self):
                    return self.num_samples

            class MySampler(Sampler):
                def __init__(self, data_source):
                    self.data_source = data_source

                def __iter__(self):
                    return iter(range(len(self.data_source)))

                def __len__(self):
                    return len(self.data_source)

            sampler = MySampler(data_source=RandomDataset(100))

            for index in sampler:
                print(index)

    see `paddle.io.BatchSampler`
    see `paddle.io.DataLoader`

    """

    def __init__(self, data_source=None):
        self.data_source = data_source

    def __iter__(self):
        # Subclasses must override this; name the offending class so the
        # failure is easy to trace back to the missing override.
        raise NotImplementedError(
            "'{}' is not implemented in class {}".format(
                '__iter__', type(self).__name__))

    # __len__ is deliberately not defined on this base class because it is
    # not needed in some cases, e.g. paddle.io.IterableDataset
class SequenceSampler(Sampler):
    """
    Iterate over samples sequentially, yielding
    :code:`0, 1, 2, ..., len(data_source) - 1`.

    Args:
        data_source(Dataset): dataset to sample from, this could be an
            instance of :code:`paddle.io.Dataset` or another Python
            object which implements :code:`__len__`.

    Returns:
        Sampler: a Sampler yielding sample indices sequentially

    Examples:

        .. code-block:: python

            import numpy as np
            from paddle.io import Dataset, SequenceSampler

            class RandomDataset(Dataset):
                def __init__(self, num_samples):
                    self.num_samples = num_samples

                def __getitem__(self, idx):
                    image = np.random.random([784]).astype('float32')
                    label = np.random.randint(0, 9, (1, )).astype('int64')
                    return image, label

                def __len__(self):
                    return self.num_samples

            sampler = SequenceSampler(data_source=RandomDataset(100))

            for index in sampler:
                print(index)

    see `paddle.io.Sampler`
    """

    def __init__(self, data_source):
        self.data_source = data_source

    def __len__(self):
        # One index per element of the data source.
        return len(self.data_source)

    def __iter__(self):
        # The length is read on every call, so a fresh iterator always
        # covers the data source's current size.
        total = len(self.data_source)
        return iter(range(total))
class RandomSampler(Sampler):
    """
    Iterate over samples randomly, yielding shuffled indices.

    If :attr:`replacement` is False, the shuffled indices of the whole data
    source are yielded. If :attr:`replacement` is True, :attr:`num_samples`
    can be set to specify how many samples to draw.

    Args:
        data_source(Dataset): dataset to sample from, this could be an
            instance of :code:`paddle.io.Dataset` or another Python
            object which implements :code:`__len__`.
        replacement(bool): If False, sample the whole dataset. If True,
            set :attr:`num_samples` for how many samples to draw.
            Default False.
        num_samples(int): number of samples to draw if :attr:`replacement`
            is True. Default None, which means the length of
            :attr:`data_source`.
        generator(Generator): an iterable of indices used to sample the data
            source. When set, indices are taken from it instead of being
            drawn randomly, and at most :attr:`num_samples` of them are
            yielded. Default None.

    Returns:
        Sampler: a Sampler yielding sample indices randomly

    Raises:
        TypeError: if :attr:`replacement` is not a bool.
        ValueError: if :attr:`num_samples` is set while :attr:`replacement`
            is False, or if the effective number of samples is not a
            positive integer.

    Examples:

        .. code-block:: python

            import numpy as np
            from paddle.io import Dataset, RandomSampler

            class RandomDataset(Dataset):
                def __init__(self, num_samples):
                    self.num_samples = num_samples

                def __getitem__(self, idx):
                    image = np.random.random([784]).astype('float32')
                    label = np.random.randint(0, 9, (1, )).astype('int64')
                    return image, label

                def __len__(self):
                    return self.num_samples

            sampler = RandomSampler(data_source=RandomDataset(100))

            for index in sampler:
                print(index)

    see `paddle.io.Sampler`
    """

    def __init__(self,
                 data_source,
                 replacement=False,
                 num_samples=None,
                 generator=None):
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples
        self.generator = generator

        if not isinstance(self.replacement, bool):
            raise TypeError("expect boolean value for replacement, but got "
                            "replacement={}".format(self.replacement))

        if self._num_samples is not None and not replacement:
            raise ValueError(
                "num_samples should not be specified while replacement is False"
            )

        # Validate the effective value (falls back to len(data_source)).
        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError("num_samples should be a positive integer, "
                             "but got num_samples={}".format(self.num_samples))

    @property
    def num_samples(self):
        """Number of indices one pass yields; defaults to the data source length."""
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples

    def __iter__(self):
        n = len(self.data_source)
        if self.generator:
            # Cap the user-supplied index stream at num_samples so that
            # iteration agrees with __len__ and terminates even when the
            # generator is longer (or infinite). The check happens after the
            # yield so no extra element is consumed from the generator.
            remaining = self.num_samples
            if remaining <= 0:
                return
            for index in self.generator:
                yield index
                remaining -= 1
                if remaining <= 0:
                    return
        else:
            if self.replacement:
                # Draw num_samples indices, duplicates allowed.
                for index in np.random.choice(
                        np.arange(n), self.num_samples, replace=True).tolist():
                    yield index
            else:
                # Draw every index exactly once, in random order.
                for index in np.random.choice(
                        np.arange(n), n, replace=False).tolist():
                    yield index

    def __len__(self):
        return self.num_samples
python/paddle/fluid/tests/unittests/test_batch_sampler.py
浏览文件 @
c45481d7
...
...
@@ -17,7 +17,7 @@ from __future__ import division
import
unittest
import
paddle.fluid
as
fluid
from
paddle.io
import
BatchSampler
,
Dataset
from
paddle.io
import
BatchSampler
,
Dataset
,
Sampler
,
SequenceSampler
,
RandomSampler
class
RandomDataset
(
Dataset
):
...
...
@@ -35,6 +35,60 @@ class RandomDataset(Dataset):
return
self
.
sample_num
class TestSampler(unittest.TestCase):
    """The abstract Sampler base class must refuse to be iterated."""

    def test_main(self):
        dataset = RandomDataset(100, 10)
        sampler = Sampler(dataset)
        # assertRaises fails the test with a clear message if no exception
        # (or a different one) is raised.
        with self.assertRaises(NotImplementedError):
            iter(sampler)
class TestSequenceSampler(unittest.TestCase):
    """SequenceSampler must yield 0..len(dataset)-1 in order."""

    def test_main(self):
        dataset = RandomDataset(100, 10)
        sampler = SequenceSampler(dataset)
        self.assertEqual(len(sampler), 100)

        # Comparing the whole sequence checks both the order and the number
        # of yielded indices; unittest assertions also survive `python -O`.
        self.assertEqual(list(iter(sampler)), list(range(100)))
class TestRandomSampler(unittest.TestCase):
    """Checks RandomSampler with default, num_samples and generator settings."""

    def test_main(self):
        # Without replacement every index must appear exactly once.
        dataset = RandomDataset(100, 10)
        sampler = RandomSampler(dataset)
        self.assertEqual(len(sampler), 100)

        rets = list(iter(sampler))
        self.assertEqual(sorted(rets), list(range(0, 100)))

    def test_with_num_samples(self):
        # With replacement, exactly num_samples in-range indices are drawn.
        dataset = RandomDataset(100, 10)
        sampler = RandomSampler(dataset, num_samples=50, replacement=True)
        self.assertEqual(len(sampler), 50)

        rets = list(iter(sampler))
        self.assertEqual(len(rets), 50)
        for i in rets:
            self.assertTrue(0 <= i < 100)

    def test_with_generator(self):
        # A user-supplied generator provides the indices directly.
        dataset = RandomDataset(100, 10)
        generator = iter(range(0, 60))
        sampler = RandomSampler(dataset, generator=generator)
        self.assertEqual(len(sampler), 100)

        rets = list(iter(sampler))
        self.assertEqual(sorted(rets), list(range(0, 60)))
class
TestBatchSampler
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
num_samples
=
1000
...
...
@@ -86,16 +140,18 @@ class TestBatchSamplerShuffle(TestBatchSampler):
self
.
drop_last
=
True
class
TestBatchSamplerWith
Indices
(
TestBatchSampler
):
class
TestBatchSamplerWith
Sampler
(
TestBatchSampler
):
def
init_batch_sampler
(
self
):
dataset
=
RandomDataset
(
1000
,
10
)
sampler
=
SequenceSampler
(
dataset
)
bs
=
BatchSampler
(
indices
=
list
(
range
(
self
.
num_samples
))
,
sampler
=
sampler
,
batch_size
=
self
.
batch_size
,
drop_last
=
self
.
drop_last
)
return
bs
class
TestBatchSamplerWith
IndicesAndDataSource
(
unittest
.
TestCase
):
class
TestBatchSamplerWith
SamplerDropLast
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
num_samples
=
1000
self
.
num_classes
=
10
...
...
@@ -103,12 +159,22 @@ class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
self
.
shuffle
=
False
self
.
drop_last
=
True
class
TestBatchSamplerWithSamplerShuffle
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
num_samples
=
1000
self
.
num_classes
=
10
self
.
batch_size
=
32
self
.
shuffle
=
True
self
.
drop_last
=
True
def
test_main
(
self
):
try
:
dataset
=
RandomDataset
(
self
.
num_samples
,
self
.
num_classes
)
sampler
=
RandomSampler
(
dataset
)
bs
=
BatchSampler
(
dataset
=
dataset
,
indices
=
list
(
range
(
self
.
num_samples
))
,
sampler
=
sampler
,
shuffle
=
self
.
shuffle
,
batch_size
=
self
.
batch_size
,
drop_last
=
self
.
drop_last
)
self
.
assertTrue
(
False
)
...
...
python/paddle/io/__init__.py
浏览文件 @
c45481d7
...
...
@@ -20,6 +20,9 @@ __all__ = [
# 'Transform',
'DataLoader'
,
'get_worker_info'
,
'Sampler'
,
'SequenceSampler'
,
'RandomSampler'
,
'load'
,
'save'
,
'load_program_state'
,
...
...
@@ -38,7 +41,8 @@ __all__ = [
]
from
..fluid.io
import
DataLoader
from
..fluid.dataloader
import
Dataset
,
IterableDataset
,
BatchSampler
,
get_worker_info
from
..fluid.dataloader
import
Dataset
,
IterableDataset
,
BatchSampler
,
get_worker_info
,
\
Sampler
,
SequenceSampler
,
RandomSampler
from
..fluid.io
import
load
,
save
,
load_program_state
,
set_program_state
,
\
load_inference_model
,
save_inference_model
,
batch
from
..reader
import
shuffle
,
buffered
,
cache
,
chain
,
firstn
,
compose
,
map_readers
,
xmap_readers
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录