Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
50d3117d
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
50d3117d
编写于
12月 09, 2020
作者:
J
joejiong
提交者:
GitHub
12月 09, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add random_split and Subset dataset (#29291)
As the title
上级
87e75a77
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
242 additions
and
19 deletions
+242
-19
python/paddle/fluid/dataloader/dataset.py
python/paddle/fluid/dataloader/dataset.py
+129
-1
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
...d/tests/unittests/test_multiprocess_dataloader_dataset.py
+110
-17
python/paddle/io/__init__.py
python/paddle/io/__init__.py
+3
-1
未找到文件。
python/paddle/fluid/dataloader/dataset.py
100644 → 100755
浏览文件 @
50d3117d
...
@@ -19,7 +19,7 @@ import paddle.dataset.common
...
@@ -19,7 +19,7 @@ import paddle.dataset.common
__all__
=
[
__all__
=
[
"Dataset"
,
"IterableDataset"
,
"TensorDataset"
,
"ComposeDataset"
,
"Dataset"
,
"IterableDataset"
,
"TensorDataset"
,
"ComposeDataset"
,
"ChainDataset"
"ChainDataset"
,
"random_split"
,
"Subset"
]
]
...
@@ -400,3 +400,131 @@ class ChainDataset(IterableDataset):
...
@@ -400,3 +400,131 @@ class ChainDataset(IterableDataset):
for
dataset
in
self
.
datasets
:
for
dataset
in
self
.
datasets
:
for
sample
in
dataset
:
for
sample
in
dataset
:
yield
sample
yield
sample
class Subset(Dataset):
    """
    A dataset view exposing only the positions listed in ``indices``.

    Args:
        dataset (Dataset): The whole Dataset.
        indices (sequence): Indices in the whole set selected for subset.

    Returns:
        Dataset: A Dataset which is the subset of the original dataset.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import Subset

            # example 1:
            a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
            print(list(a))
            # [1, 3]

            # example 2: repeated indices are allowed
            b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
            print(list(b))
            # [2, 2]
    """

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        # Map the subset-local position onto the wrapped dataset's index.
        underlying_idx = self.indices[idx]
        return self.dataset[underlying_idx]

    def __len__(self):
        # The subset's length is the number of selected indices, not the
        # length of the wrapped dataset.
        return len(self.indices)
def random_split(dataset, lengths, generator=None):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Args:
        dataset (Dataset): Dataset to be split.
        lengths (sequence): lengths of splits to be produced.
        generator (Generator, optional): Generator intended for the random
            permutation. Default is None. NOTE(review): this argument is
            currently not used anywhere in the implementation — the
            permutation always comes from ``paddle.randperm`` and the global
            random state; confirm before relying on it for reproducibility.

    Returns:
        Datasets: A list of subset Datasets, which are the non-overlapping
            subsets of the original Dataset.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import random_split

            a_list = paddle.io.random_split(range(10), [3, 7])
            print(len(a_list))
            # 2
            # Each element of a_list is a Subset over a random permutation
            # of range(10); the two subsets are disjoint and cover it.
    """
    # Cannot verify that dataset is Sized
    total_length = sum(lengths)
    if total_length != len(dataset):  # type: ignore
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    # TODO(@Joejiong): support Variable or Tensor type with .tolist class member function.
    # For example var.item() and var.tolist()
    indices = paddle.randperm(total_length).numpy().tolist()

    # Walk the running totals of `lengths`: each split takes the slice of the
    # permuted indices ending at its cumulative offset.
    subsets = []
    for offset, length in zip(_accumulate(lengths), lengths):
        subsets.append(Subset(dataset, indices[offset - length:offset]))
    return subsets
def
_accumulate
(
iterable
,
fn
=
lambda
x
,
y
:
x
+
y
):
"""
Return running totals
Args:
iterable: any iterable object for example dataset.
y (x): one element in the iterable object.
fn (x, y): Defaults to lambdax.
Yields:
yields total from beginning iterator to current iterator.
Example code:
.. code-block:: python
_accumulate([1,2,3,4,5]) --> 1 3 6 10 15
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
"""
it
=
iter
(
iterable
)
try
:
total
=
next
(
it
)
except
StopIteration
:
return
yield
total
for
element
in
it
:
total
=
fn
(
total
,
element
)
yield
total
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
100644 → 100755
浏览文件 @
50d3117d
...
@@ -20,8 +20,7 @@ import numpy as np
...
@@ -20,8 +20,7 @@ import numpy as np
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
paddle.io
import
Dataset
,
IterableDataset
,
TensorDataset
,
\
from
paddle.io
import
Dataset
,
IterableDataset
,
TensorDataset
,
\
ComposeDataset
,
ChainDataset
,
DataLoader
ComposeDataset
,
ChainDataset
,
DataLoader
,
random_split
,
Subset
from
paddle.fluid.dygraph.base
import
to_variable
IMAGE_SIZE
=
32
IMAGE_SIZE
=
32
...
@@ -54,14 +53,14 @@ class RandomIterableDataset(IterableDataset):
...
@@ -54,14 +53,14 @@ class RandomIterableDataset(IterableDataset):
class
TestTensorDataset
(
unittest
.
TestCase
):
class
TestTensorDataset
(
unittest
.
TestCase
):
def
run_main
(
self
,
num_workers
,
places
):
def
run_main
(
self
,
num_workers
,
places
):
fluid
.
default_startup_program
().
random_seed
=
1
paddle
.
static
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
paddle
.
static
.
default_main_program
().
random_seed
=
1
place
=
fluid
.
CPUPlace
()
place
=
paddle
.
CPUPlace
()
with
fluid
.
dygraph
.
guard
(
place
):
with
fluid
.
dygraph
.
guard
(
place
):
input_np
=
np
.
random
.
random
([
16
,
3
,
4
]).
astype
(
'float32'
)
input_np
=
np
.
random
.
random
([
16
,
3
,
4
]).
astype
(
'float32'
)
input
=
to_variable
(
input_np
)
input
=
paddle
.
to_tensor
(
input_np
)
label_np
=
np
.
random
.
random
([
16
,
1
]).
astype
(
'int32'
)
label_np
=
np
.
random
.
random
([
16
,
1
]).
astype
(
'int32'
)
label
=
to_variable
(
label_np
)
label
=
paddle
.
to_tensor
(
label_np
)
dataset
=
TensorDataset
([
input
,
label
])
dataset
=
TensorDataset
([
input
,
label
])
assert
len
(
dataset
)
==
16
assert
len
(
dataset
)
==
16
...
@@ -83,17 +82,17 @@ class TestTensorDataset(unittest.TestCase):
...
@@ -83,17 +82,17 @@ class TestTensorDataset(unittest.TestCase):
assert
np
.
allclose
(
label
.
numpy
(),
label_np
[
i
])
assert
np
.
allclose
(
label
.
numpy
(),
label_np
[
i
])
def
test_main
(
self
):
def
test_main
(
self
):
places
=
[
fluid
.
CPUPlace
()]
places
=
[
paddle
.
CPUPlace
()]
if
fluid
.
cor
e
.
is_compiled_with_cuda
():
if
paddl
e
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
places
.
append
(
paddle
.
CUDAPlace
(
0
))
for
p
in
places
:
for
p
in
places
:
self
.
run_main
(
num_workers
=
0
,
places
=
p
)
self
.
run_main
(
num_workers
=
0
,
places
=
p
)
class
TestComposeDataset
(
unittest
.
TestCase
):
class
TestComposeDataset
(
unittest
.
TestCase
):
def
test_main
(
self
):
def
test_main
(
self
):
fluid
.
default_startup_program
().
random_seed
=
1
paddle
.
static
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
paddle
.
static
.
default_main_program
().
random_seed
=
1
dataset1
=
RandomDataset
(
10
)
dataset1
=
RandomDataset
(
10
)
dataset2
=
RandomDataset
(
10
)
dataset2
=
RandomDataset
(
10
)
...
@@ -110,10 +109,104 @@ class TestComposeDataset(unittest.TestCase):
...
@@ -110,10 +109,104 @@ class TestComposeDataset(unittest.TestCase):
assert
np
.
allclose
(
label2
,
label2_t
)
assert
np
.
allclose
(
label2
,
label2_t
)
class TestRandomSplitApi(unittest.TestCase):
    def test_main(self):
        # Fix the global random seed so the permutation is reproducible.
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])

        self.assertTrue(len(dataset1) == 1)
        self.assertTrue(len(dataset2) == 4)

        # Every element of range(5) must land in exactly one of the two
        # splits: removing each seen value must empty the pool exactly.
        remaining = list(range(5))
        for val in dataset1:
            remaining.remove(val)
        for val in dataset2:
            remaining.remove(val)
        self.assertTrue(len(remaining) == 0)
class TestRandomSplitError(unittest.TestCase):
    def test_errors(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        # Any `lengths` whose sum differs from len(dataset) must raise,
        # including the empty list.
        for bad_lengths in ([3, 8], [8], []):
            self.assertRaises(ValueError, paddle.io.random_split,
                              range(5), bad_lengths)
class TestSubsetDataset(unittest.TestCase):
    """Exercise paddle.io.Subset through a DataLoader round trip."""

    def run_main(self, num_workers, places):
        # Seed both static programs for determinism across runs.
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        # 5 samples of shape (3, 4) with scalar int labels.
        input_np = np.random.random([5, 3, 4]).astype('float32')
        input = paddle.to_tensor(input_np)
        label_np = np.random.random([5, 1]).astype('int32')
        label = paddle.to_tensor(label_np)

        dataset = TensorDataset([input, label])
        # Two disjoint subsets that together cover all 5 indices.
        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
        odd_subset = paddle.io.Subset(dataset, [1, 3])

        assert len(dataset) == 5

        def prepare_dataloader(dataset):
            # batch_size=1 so every yielded batch maps to one sample.
            return DataLoader(
                dataset,
                places=places,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True)

        dataloader = prepare_dataloader(dataset)
        dataloader_even = prepare_dataloader(even_subset)
        dataloader_odd = prepare_dataloader(odd_subset)

        def assert_basic(input, label):
            # Shape/type invariants for a single-sample batch.
            assert len(input) == 1
            assert len(label) == 1
            assert input.shape == [1, 3, 4]
            assert label.shape == [1, 1]
            assert isinstance(input, paddle.Tensor)
            assert isinstance(label, paddle.Tensor)

        # Collect all labels from the full dataset...
        elements_list = list()
        for _, (input, label) in enumerate(dataloader()):
            assert_basic(input, label)
            elements_list.append(label)

        # ...remove the ones seen in the even subset...
        for _, (input, label) in enumerate(dataloader_even()):
            assert_basic(input, label)
            elements_list.remove(label)

        # ...so what remains must be exactly the odd subset's labels.
        odd_list = list()
        for _, (input, label) in enumerate(dataloader_odd()):
            assert_basic(input, label)
            odd_list.append(label)

        self.assertEqual(odd_list, elements_list)

    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)
class
TestChainDataset
(
unittest
.
TestCase
):
class
TestChainDataset
(
unittest
.
TestCase
):
def
run_main
(
self
,
num_workers
,
places
):
def
run_main
(
self
,
num_workers
,
places
):
fluid
.
default_startup_program
().
random_seed
=
1
paddle
.
static
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
paddle
.
static
.
default_main_program
().
random_seed
=
1
dataset1
=
RandomIterableDataset
(
10
)
dataset1
=
RandomIterableDataset
(
10
)
dataset2
=
RandomIterableDataset
(
10
)
dataset2
=
RandomIterableDataset
(
10
)
...
@@ -135,9 +228,9 @@ class TestChainDataset(unittest.TestCase):
...
@@ -135,9 +228,9 @@ class TestChainDataset(unittest.TestCase):
idx
+=
1
idx
+=
1
def
test_main
(
self
):
def
test_main
(
self
):
places
=
[
fluid
.
CPUPlace
()]
places
=
[
paddle
.
CPUPlace
()]
if
fluid
.
cor
e
.
is_compiled_with_cuda
():
if
paddl
e
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
places
.
append
(
paddle
.
CUDAPlace
(
0
))
for
p
in
places
:
for
p
in
places
:
self
.
run_main
(
num_workers
=
0
,
places
=
p
)
self
.
run_main
(
num_workers
=
0
,
places
=
p
)
...
...
python/paddle/io/__init__.py
100644 → 100755
浏览文件 @
50d3117d
...
@@ -28,9 +28,11 @@ __all__ = [
...
@@ -28,9 +28,11 @@ __all__ = [
'SequenceSampler'
,
'SequenceSampler'
,
'RandomSampler'
,
'RandomSampler'
,
'WeightedRandomSampler'
,
'WeightedRandomSampler'
,
'random_split'
,
'Subset'
]
]
from
..fluid.io
import
DataLoader
from
..fluid.io
import
DataLoader
from
..fluid.dataloader
import
Dataset
,
IterableDataset
,
BatchSampler
,
get_worker_info
,
\
from
..fluid.dataloader
import
Dataset
,
IterableDataset
,
BatchSampler
,
get_worker_info
,
\
TensorDataset
,
Sampler
,
SequenceSampler
,
RandomSampler
,
DistributedBatchSampler
,
\
TensorDataset
,
Sampler
,
SequenceSampler
,
RandomSampler
,
DistributedBatchSampler
,
\
ComposeDataset
,
ChainDataset
,
WeightedRandomSampler
ComposeDataset
,
ChainDataset
,
WeightedRandomSampler
,
Subset
,
random_split
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录