Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
50d3117d
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
50d3117d
编写于
12月 09, 2020
作者:
J
joejiong
提交者:
GitHub
12月 09, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add random_split and Subset dataset (#29291)
As the title
上级
87e75a77
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
242 additions
and
19 deletions
+242
-19
python/paddle/fluid/dataloader/dataset.py
python/paddle/fluid/dataloader/dataset.py
+129
-1
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
...d/tests/unittests/test_multiprocess_dataloader_dataset.py
+110
-17
python/paddle/io/__init__.py
python/paddle/io/__init__.py
+3
-1
未找到文件。
python/paddle/fluid/dataloader/dataset.py
100644 → 100755
浏览文件 @
50d3117d
...
...
@@ -19,7 +19,7 @@ import paddle.dataset.common
__all__
=
[
"Dataset"
,
"IterableDataset"
,
"TensorDataset"
,
"ComposeDataset"
,
"ChainDataset"
"ChainDataset"
,
"random_split"
,
"Subset"
]
...
...
@@ -400,3 +400,131 @@ class ChainDataset(IterableDataset):
for
dataset
in
self
.
datasets
:
for
sample
in
dataset
:
yield
sample
class Subset(Dataset):
    """
    Subset of a dataset at specified indices.

    Args:
        dataset (Dataset): The whole Dataset.
        indices (sequence): Indices in the whole set selected for subset.
            Duplicates are allowed; the subset then repeats those samples.

    Returns:
        Dataset: A Dataset which is the subset of the original dataset.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import Subset

            # example 1:
            a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
            print(list(a))
            # [1, 3]

            # example 2:
            b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
            print(list(b))
            # [2, 2]
    """

    def __init__(self, dataset, indices):
        # Keep references only; no data is copied — the subset is a view.
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        # Map the subset-local index to an index of the wrapped dataset.
        return self.dataset[self.indices[idx]]

    def __len__(self):
        # Length is determined by the index list, not the wrapped dataset.
        return len(self.indices)
def random_split(dataset, lengths, generator=None):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Args:
        dataset (Dataset): Dataset to be split.
        lengths (sequence): lengths of splits to be produced; must sum to
            ``len(dataset)``.
        generator (Generator, optional): Generator used for the random
            permutation. Default is None.
            NOTE(review): ``generator`` is accepted but never read below —
            the permutation always comes from paddle's global RNG state;
            confirm this is intended.

    Returns:
        Datasets: A list of subset Datasets, which are the non-overlapping
        subsets of the original Dataset.

    Raises:
        ValueError: If ``sum(lengths)`` differs from ``len(dataset)``.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import random_split

            a_list = paddle.io.random_split(range(10), [3, 7])
            print(len(a_list))
            # 2
    """
    total = sum(lengths)
    # Cannot verify that dataset is Sized
    if total != len(dataset):  # type: ignore
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    # TODO(@Joejiong): support Variable or Tensor type with .tolist class member function.
    # For example var.item() and var.tolist()
    shuffled = paddle.randperm(total).numpy().tolist()

    # Each split takes the slice [offset - length, offset) of the shuffled
    # index list, where offset is the running total of the lengths so far.
    subsets = []
    for offset, length in zip(_accumulate(lengths), lengths):
        subsets.append(Subset(dataset, shuffled[offset - length:offset]))
    return subsets
def
_accumulate
(
iterable
,
fn
=
lambda
x
,
y
:
x
+
y
):
"""
Return running totals
Args:
iterable: any iterable object for example dataset.
y (x): one element in the iterable object.
fn (x, y): Defaults to lambdax.
Yields:
yields total from beginning iterator to current iterator.
Example code:
.. code-block:: python
_accumulate([1,2,3,4,5]) --> 1 3 6 10 15
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
"""
it
=
iter
(
iterable
)
try
:
total
=
next
(
it
)
except
StopIteration
:
return
yield
total
for
element
in
it
:
total
=
fn
(
total
,
element
)
yield
total
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
100644 → 100755
浏览文件 @
50d3117d
...
...
@@ -20,8 +20,7 @@ import numpy as np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.io
import
Dataset
,
IterableDataset
,
TensorDataset
,
\
ComposeDataset
,
ChainDataset
,
DataLoader
from
paddle.fluid.dygraph.base
import
to_variable
ComposeDataset
,
ChainDataset
,
DataLoader
,
random_split
,
Subset
IMAGE_SIZE
=
32
...
...
@@ -54,14 +53,14 @@ class RandomIterableDataset(IterableDataset):
class
TestTensorDataset
(
unittest
.
TestCase
):
def
run_main
(
self
,
num_workers
,
places
):
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
place
=
fluid
.
CPUPlace
()
paddle
.
static
.
default_startup_program
().
random_seed
=
1
paddle
.
static
.
default_main_program
().
random_seed
=
1
place
=
paddle
.
CPUPlace
()
with
fluid
.
dygraph
.
guard
(
place
):
input_np
=
np
.
random
.
random
([
16
,
3
,
4
]).
astype
(
'float32'
)
input
=
to_variable
(
input_np
)
input
=
paddle
.
to_tensor
(
input_np
)
label_np
=
np
.
random
.
random
([
16
,
1
]).
astype
(
'int32'
)
label
=
to_variable
(
label_np
)
label
=
paddle
.
to_tensor
(
label_np
)
dataset
=
TensorDataset
([
input
,
label
])
assert
len
(
dataset
)
==
16
...
...
@@ -83,17 +82,17 @@ class TestTensorDataset(unittest.TestCase):
assert
np
.
allclose
(
label
.
numpy
(),
label_np
[
i
])
def
test_main
(
self
):
places
=
[
fluid
.
CPUPlace
()]
if
fluid
.
cor
e
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
places
=
[
paddle
.
CPUPlace
()]
if
paddl
e
.
is_compiled_with_cuda
():
places
.
append
(
paddle
.
CUDAPlace
(
0
))
for
p
in
places
:
self
.
run_main
(
num_workers
=
0
,
places
=
p
)
class
TestComposeDataset
(
unittest
.
TestCase
):
def
test_main
(
self
):
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
paddle
.
static
.
default_startup_program
().
random_seed
=
1
paddle
.
static
.
default_main_program
().
random_seed
=
1
dataset1
=
RandomDataset
(
10
)
dataset2
=
RandomDataset
(
10
)
...
...
@@ -110,10 +109,104 @@ class TestComposeDataset(unittest.TestCase):
assert
np
.
allclose
(
label2
,
label2_t
)
class TestRandomSplitApi(unittest.TestCase):
    """random_split must partition the dataset: every element lands in
    exactly one subset and subset lengths match the requested lengths."""

    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])

        self.assertTrue(len(dataset1) == 1)
        self.assertTrue(len(dataset2) == 4)

        # Removing every produced element from [0..4] must empty the list,
        # proving the two subsets are disjoint and jointly exhaustive.
        elements_list = list(range(5))
        for _, val in enumerate(dataset1):
            elements_list.remove(val)
        for _, val in enumerate(dataset2):
            elements_list.remove(val)
        self.assertTrue(len(elements_list) == 0)
class TestRandomSplitError(unittest.TestCase):
    """random_split must reject length specs that do not sum to len(dataset)."""

    def test_errors(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        # Sum too large (3 + 8), sum too large ([8]), and empty spec (sum 0)
        # against a dataset of length 5 — all must raise ValueError.
        self.assertRaises(ValueError, paddle.io.random_split, range(5), [3, 8])
        self.assertRaises(ValueError, paddle.io.random_split, range(5), [8])
        self.assertRaises(ValueError, paddle.io.random_split, range(5), [])
class TestSubsetDataset(unittest.TestCase):
    """Subset must expose the selected samples through a DataLoader, and the
    even/odd index subsets together must cover the full dataset."""

    def run_main(self, num_workers, places):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        input_np = np.random.random([5, 3, 4]).astype('float32')
        input = paddle.to_tensor(input_np)
        label_np = np.random.random([5, 1]).astype('int32')
        label = paddle.to_tensor(label_np)

        dataset = TensorDataset([input, label])
        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
        odd_subset = paddle.io.Subset(dataset, [1, 3])

        assert len(dataset) == 5

        def prepare_dataloader(dataset):
            # Same loader config for the full set and both subsets so their
            # batches are directly comparable.
            return DataLoader(
                dataset,
                places=places,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True)

        dataloader = prepare_dataloader(dataset)
        dataloader_even = prepare_dataloader(even_subset)
        dataloader_odd = prepare_dataloader(odd_subset)

        def assert_basic(input, label):
            # Sanity-check batch shapes and tensor types for every sample.
            assert len(input) == 1
            assert len(label) == 1
            assert input.shape == [1, 3, 4]
            assert label.shape == [1, 1]
            assert isinstance(input, paddle.Tensor)
            assert isinstance(label, paddle.Tensor)

        # Collect every label from the full dataset ...
        elements_list = list()
        for _, (input, label) in enumerate(dataloader()):
            assert_basic(input, label)
            elements_list.append(label)

        # ... remove the even-index labels ...
        for _, (input, label) in enumerate(dataloader_even()):
            assert_basic(input, label)
            elements_list.remove(label)

        # ... what remains must be exactly the odd-index labels.
        odd_list = list()
        for _, (input, label) in enumerate(dataloader_odd()):
            assert_basic(input, label)
            odd_list.append(label)

        self.assertEqual(odd_list, elements_list)

    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)
class
TestChainDataset
(
unittest
.
TestCase
):
def
run_main
(
self
,
num_workers
,
places
):
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
paddle
.
static
.
default_startup_program
().
random_seed
=
1
paddle
.
static
.
default_main_program
().
random_seed
=
1
dataset1
=
RandomIterableDataset
(
10
)
dataset2
=
RandomIterableDataset
(
10
)
...
...
@@ -135,9 +228,9 @@ class TestChainDataset(unittest.TestCase):
idx
+=
1
def
test_main
(
self
):
places
=
[
fluid
.
CPUPlace
()]
if
fluid
.
cor
e
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
places
=
[
paddle
.
CPUPlace
()]
if
paddl
e
.
is_compiled_with_cuda
():
places
.
append
(
paddle
.
CUDAPlace
(
0
))
for
p
in
places
:
self
.
run_main
(
num_workers
=
0
,
places
=
p
)
...
...
python/paddle/io/__init__.py
100644 → 100755
浏览文件 @
50d3117d
...
...
@@ -28,9 +28,11 @@ __all__ = [
'SequenceSampler'
,
'RandomSampler'
,
'WeightedRandomSampler'
,
'random_split'
,
'Subset'
]
from
..fluid.io
import
DataLoader
from
..fluid.dataloader
import
Dataset
,
IterableDataset
,
BatchSampler
,
get_worker_info
,
\
TensorDataset
,
Sampler
,
SequenceSampler
,
RandomSampler
,
DistributedBatchSampler
,
\
ComposeDataset
,
ChainDataset
,
WeightedRandomSampler
ComposeDataset
,
ChainDataset
,
WeightedRandomSampler
,
Subset
,
random_split
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录