Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
7c9b1ab6
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7c9b1ab6
编写于
8月 03, 2023
作者:
N
Nyakku Shigure
提交者:
GitHub
8月 03, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[xdoctest] reformat example code with google style in `paddle/io` (#55732)
上级
ff226ba1
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
524 addition
and
465 deletion
+524
-465
python/paddle/io/dataloader/batch_sampler.py
python/paddle/io/dataloader/batch_sampler.py
+83
-79
python/paddle/io/dataloader/dataset.py
python/paddle/io/dataloader/dataset.py
+232
-206
python/paddle/io/dataloader/sampler.py
python/paddle/io/dataloader/sampler.py
+97
-73
python/paddle/io/dataloader/worker.py
python/paddle/io/dataloader/worker.py
+52
-41
python/paddle/io/multiprocess_utils.py
python/paddle/io/multiprocess_utils.py
+0
-2
python/paddle/io/reader.py
python/paddle/io/reader.py
+60
-64
未找到文件。
python/paddle/io/dataloader/batch_sampler.py
浏览文件 @
7c9b1ab6
...
@@ -58,40 +58,44 @@ class BatchSampler(Sampler):
...
@@ -58,40 +58,44 @@ class BatchSampler(Sampler):
.. code-block:: python
.. code-block:: python
from paddle.io import RandomSampler, BatchSampler, Dataset
>>> import numpy as np
>>> from paddle.io import RandomSampler, BatchSampler, Dataset
# init with dataset
class RandomDataset(Dataset):
>>> np.random.seed(2023)
def __init__(self, num_samples):
>>> # init with dataset
self.num_samples = num_samples
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
def __getitem__(self, idx):
... self.num_samples = num_samples
image = np.random.random([784]).astype('float32')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
... def __getitem__(self, idx):
return image, label
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
def __len__(self):
... return image, label
return self.num_samples
...
... def __len__(self):
bs = BatchSampler(dataset=RandomDataset(100),
... return self.num_samples
shuffle=False,
...
batch_size=16,
>>> bs = BatchSampler(dataset=RandomDataset(100),
drop_last=False)
... shuffle=False,
... batch_size=16,
for batch_indices in bs:
... drop_last=False)
print(batch_indices)
...
>>> for batch_indices in bs:
# init with sampler
... print(batch_indices)
sampler = RandomSampler(RandomDataset(100))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
bs = BatchSampler(sampler=sampler,
...
batch_size=8,
[96, 97, 98, 99]
drop_last=True)
>>> # init with sampler
>>> sampler = RandomSampler(RandomDataset(100))
for batch_indices in bs:
>>> bs = BatchSampler(sampler=sampler,
print(batch_indices)
... batch_size=8,
... drop_last=True)
...
>>> for batch_indices in bs:
... print(batch_indices)
[56, 12, 68, 0, 82, 66, 91, 44]
...
[53, 17, 22, 86, 52, 3, 92, 33]
"""
"""
def
__init__
(
def
__init__
(
...
@@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler):
...
@@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler):
Examples:
Examples:
.. code-block:: python
.. code-block:: python
import numpy as np
>>>
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
>>>
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
>>>
# init with dataset
class RandomDataset(Dataset):
>>>
class RandomDataset(Dataset):
def __init__(self, num_samples):
...
def __init__(self, num_samples):
self.num_samples = num_samples
...
self.num_samples = num_samples
...
def __getitem__(self, idx):
...
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
...
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
...
return image, label
...
def __len__(self):
...
def __len__(self):
return self.num_samples
...
return self.num_samples
...
dataset = RandomDataset(100)
>>>
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
>>>
sampler = DistributedBatchSampler(dataset, batch_size=64)
for data in sampler:
>>>
for data in sampler:
# do something
...
# do something
break
...
break
"""
"""
def
__init__
(
def
__init__
(
...
@@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler):
...
@@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler):
Examples:
Examples:
.. code-block:: python
.. code-block:: python
import numpy as np
>>>
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
>>>
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
>>>
# init with dataset
class RandomDataset(Dataset):
>>>
class RandomDataset(Dataset):
def __init__(self, num_samples):
...
def __init__(self, num_samples):
self.num_samples = num_samples
...
self.num_samples = num_samples
...
def __getitem__(self, idx):
...
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
...
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
...
return image, label
...
def __len__(self):
...
def __len__(self):
return self.num_samples
...
return self.num_samples
...
dataset = RandomDataset(100)
>>>
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
>>>
sampler = DistributedBatchSampler(dataset, batch_size=64)
for epoch in range(10):
>>>
for epoch in range(10):
sampler.set_epoch(epoch)
...
sampler.set_epoch(epoch)
"""
"""
self
.
epoch
=
epoch
self
.
epoch
=
epoch
python/paddle/io/dataloader/dataset.py
浏览文件 @
7c9b1ab6
...
@@ -37,26 +37,26 @@ class Dataset:
...
@@ -37,26 +37,26 @@ class Dataset:
.. code-block:: python
.. code-block:: python
import numpy as np
>>>
import numpy as np
from paddle.io import Dataset
>>>
from paddle.io import Dataset
# define a random dataset
>>>
# define a random dataset
class RandomDataset(Dataset):
>>>
class RandomDataset(Dataset):
def __init__(self, num_samples):
...
def __init__(self, num_samples):
self.num_samples = num_samples
...
self.num_samples = num_samples
...
def __getitem__(self, idx):
...
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
...
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
...
return image, label
...
def __len__(self):
...
def __len__(self):
return self.num_samples
...
return self.num_samples
...
dataset = RandomDataset(10)
>>>
dataset = RandomDataset(10)
for i in range(len(dataset)):
>>>
for i in range(len(dataset)):
print(dataset[i])
... image, label = dataset[i]
... # do something
"""
"""
def
__init__
(
self
):
def
__init__
(
self
):
...
@@ -95,23 +95,24 @@ class IterableDataset(Dataset):
...
@@ -95,23 +95,24 @@ class IterableDataset(Dataset):
.. code-block:: python
.. code-block:: python
:name: code-example1
:name: code-example1
import numpy as np
>>> import numpy as np
from paddle.io import IterableDataset
>>> from paddle.io import IterableDataset
# define a random dataset
>>> # define a random dataset
class RandomDataset(IterableDataset):
>>> class RandomDataset(IterableDataset):
def __init__(self, num_samples):
... def __init__(self, num_samples):
self.num_samples = num_samples
... self.num_samples = num_samples
...
def __iter__(self):
... def __iter__(self):
for i in range(self.num_samples):
... for i in range(self.num_samples):
image = np.random.random([784]).astype('float32')
... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
... label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label
... yield image, label
...
dataset = RandomDataset(10)
>>> dataset = RandomDataset(10)
for img, lbl in dataset:
>>> for img, label in dataset:
print(img, lbl)
... # do something
... ...
When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
will yield whole dataset samples, which means samples in dataset will be repeated in
will yield whole dataset samples, which means samples in dataset will be repeated in
...
@@ -125,87 +126,113 @@ class IterableDataset(Dataset):
...
@@ -125,87 +126,113 @@ class IterableDataset(Dataset):
.. code-block:: python
.. code-block:: python
:name: code-example2
:name: code-example2
import math
>>> import math
import paddle
>>> import paddle
import numpy as np
>>> import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset):
>>> class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end):
... def __init__(self, start, end):
self.start = start
... self.start = start
self.end = end
... self.end = end
...
def __iter__(self):
... def __iter__(self):
worker_info = get_worker_info()
... worker_info = get_worker_info()
if worker_info is None:
... if worker_info is None:
iter_start = self.start
... iter_start = self.start
iter_end = self.end
... iter_end = self.end
else:
... else:
per_worker = int(
... per_worker = int(
math.ceil((self.end - self.start) / float(
... math.ceil((self.end - self.start) / float(
worker_info.num_workers)))
... worker_info.num_workers)))
worker_id = worker_info.id
... worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
... iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end)
... iter_end = min(iter_start + per_worker, self.end)
...
for i in range(iter_start, iter_end):
... for i in range(iter_start, iter_end):
yield np.array([i])
... yield np.array([i])
...
dataset = SplitedIterableDataset(start=2, end=9)
>>> dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(
>>> dataloader = DataLoader(
dataset,
... dataset,
num_workers=2,
... num_workers=2,
batch_size=1,
... batch_size=1,
drop_last=True)
... drop_last=True)
...
for data in dataloader:
>>> for data in dataloader:
print(data)
... print(data) # doctest: +SKIP("The output depends on the environment.")
# outputs: [2, 5, 3, 6, 4, 7]
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
splitting data copy in each worker by :code:`worker_init_fn`
splitting data copy in each worker by :code:`worker_init_fn`
.. code-block:: python
.. code-block:: python
:name: code-example3
:name: code-example3
import math
>>> import math
import paddle
>>> import paddle
import numpy as np
>>> import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
class RangeIterableDataset(IterableDataset):
>>> class RangeIterableDataset(IterableDataset):
def __init__(self, start, end):
... def __init__(self, start, end):
self.start = start
... self.start = start
self.end = end
... self.end = end
...
def __iter__(self):
... def __iter__(self):
for i in range(self.start, self.end):
... for i in range(self.start, self.end):
yield np.array([i])
... yield np.array([i])
...
dataset = RangeIterableDataset(start=2, end=9)
>>> dataset = RangeIterableDataset(start=2, end=9)
def worker_init_fn(worker_id):
>>> def worker_init_fn(worker_id):
worker_info = get_worker_info()
... worker_info = get_worker_info()
...
dataset = worker_info.dataset
... dataset = worker_info.dataset
start = dataset.start
... start = dataset.start
end = dataset.end
... end = dataset.end
num_per_worker = int(
... num_per_worker = int(
math.ceil((end - start) / float(worker_info.num_workers)))
... math.ceil((end - start) / float(worker_info.num_workers)))
...
worker_id = worker_info.id
... worker_id = worker_info.id
dataset.start = start + worker_id * num_per_worker
... dataset.start = start + worker_id * num_per_worker
dataset.end = min(dataset.start + num_per_worker, end)
... dataset.end = min(dataset.start + num_per_worker, end)
...
dataloader = DataLoader(
>>> dataloader = DataLoader(
dataset,
... dataset,
num_workers=2,
... num_workers=2,
batch_size=1,
... batch_size=1,
drop_last=True,
... drop_last=True,
worker_init_fn=worker_init_fn)
... worker_init_fn=worker_init_fn)
...
for data in dataloader:
>>> for data in dataloader:
print(data)
... print(data) # doctest: +SKIP("The output depends on the environment.")
# outputs: [2, 5, 3, 6, 4, 7]
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
"""
"""
...
@@ -249,22 +276,21 @@ class TensorDataset(Dataset):
...
@@ -249,22 +276,21 @@ class TensorDataset(Dataset):
.. code-block:: python
.. code-block:: python
import numpy as np
>>>
import numpy as np
import paddle
>>>
import paddle
from paddle.io import TensorDataset
>>>
from paddle.io import TensorDataset
input_np = np.random.random([2, 3, 4]).astype('float32')
>>>
input_np = np.random.random([2, 3, 4]).astype('float32')
input = paddle.to_tensor(input_np)
>>>
input = paddle.to_tensor(input_np)
label_np = np.random.random([2, 1]).astype('int32')
>>>
label_np = np.random.random([2, 1]).astype('int32')
label = paddle.to_tensor(label_np)
>>>
label = paddle.to_tensor(label_np)
dataset = TensorDataset([input, label])
>>> dataset = TensorDataset([input, label])
for i in range(len(dataset)):
input, label = dataset[i]
print(input, label)
>>> for i in range(len(dataset)):
... input, label = dataset[i]
... # do something
"""
"""
def
__init__
(
self
,
tensors
):
def
__init__
(
self
,
tensors
):
...
@@ -309,32 +335,28 @@ class ComposeDataset(Dataset):
...
@@ -309,32 +335,28 @@ class ComposeDataset(Dataset):
.. code-block:: python
.. code-block:: python
import numpy as np
>>> import numpy as np
import paddle
>>> import paddle
from paddle.io import Dataset, ComposeDataset
>>> from paddle.io import Dataset, ComposeDataset
# define a random dataset
>>> # define a random dataset
class RandomDataset(Dataset):
>>> class RandomDataset(Dataset):
def __init__(self, num_samples):
... def __init__(self, num_samples):
self.num_samples = num_samples
... self.num_samples = num_samples
...
def __getitem__(self, idx):
... def __getitem__(self, idx):
image = np.random.random([32]).astype('float32')
... image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
... return image, label
...
def __len__(self):
... def __len__(self):
return self.num_samples
... return self.num_samples
...
dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
>>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
for i in range(len(dataset)):
>>> for i in range(len(dataset)):
image1, label1, image2, label2 = dataset[i]
... image1, label1, image2, label2 = dataset[i]
print(image1)
... # do something
print(label1)
print(image2)
print(label2)
"""
"""
def
__init__
(
self
,
datasets
):
def
__init__
(
self
,
datasets
):
...
@@ -379,25 +401,26 @@ class ChainDataset(IterableDataset):
...
@@ -379,25 +401,26 @@ class ChainDataset(IterableDataset):
.. code-block:: python
.. code-block:: python
import numpy as np
>>> import numpy as np
import paddle
>>> import paddle
from paddle.io import IterableDataset, ChainDataset
>>> from paddle.io import IterableDataset, ChainDataset
# define a random dataset
>>> # define a random dataset
class RandomDataset(IterableDataset):
>>> class RandomDataset(IterableDataset):
def __init__(self, num_samples):
... def __init__(self, num_samples):
self.num_samples = num_samples
... self.num_samples = num_samples
...
def __iter__(self):
... def __iter__(self):
for i in range(10):
... for i in range(10):
image = np.random.random([32]).astype('float32')
... image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
... label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label
... yield image, label
...
dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
>>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
for image, label in iter(dataset):
>>> for image, label in iter(dataset):
print(image, label)
... # do something
... ...
"""
"""
...
@@ -430,18 +453,18 @@ class Subset(Dataset):
...
@@ -430,18 +453,18 @@ class Subset(Dataset):
.. code-block:: python
.. code-block:: python
import paddle
>>>
import paddle
from paddle.io import Subset
>>>
from paddle.io import Subset
# example 1:
>>>
# example 1:
a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
>>>
a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
print(list(a))
>>>
print(list(a))
#
[1, 3]
[1, 3]
# example 2:
>>>
# example 2:
b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
>>>
b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
print(list(b))
>>>
print(list(b))
#
[2, 2]
[2, 2]
"""
"""
def
__init__
(
self
,
dataset
,
indices
):
def
__init__
(
self
,
dataset
,
indices
):
...
@@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None):
...
@@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None):
.. code-block:: python
.. code-block:: python
import paddle
>>> import paddle
from paddle.io import random_split
>>> paddle.seed(2023)
a_list = paddle.io.random_split(range(10), [3, 7])
>>> a_list = paddle.io.random_split(range(10), [3, 7])
print(len(a_list))
>>> print(len(a_list))
# 2
2
for idx, v in enumerate(a_list[0]):
>>> # output of the first subset
print(idx, v)
>>> for idx, v in enumerate(a_list[0]):
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
# output of the first subset
0 7
# 0 1
1 6
# 1 3
2 5
# 2 9
>>> # output of the second subset
for idx, v in enumerate(a_list[1]):
>>> for idx, v in enumerate(a_list[1]):
print(idx, v)
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
# output of the second subset
0 1
# 0 5
1 9
# 1 7
2 4
# 2 8
3 2
# 3 6
4 0
# 4 0
5 3
# 5 2
6 8
# 6 4
"""
"""
# Cannot verify that dataset is Sized
# Cannot verify that dataset is Sized
if
sum
(
lengths
)
!=
len
(
dataset
):
# type: ignore
if
sum
(
lengths
)
!=
len
(
dataset
):
# type: ignore
...
@@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y):
...
@@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y):
.. code-block:: python
.. code-block:: python
_accumulate([1,2,3,4,5]) --> 1 3 6 10 15
>>> list(_accumulate([1, 2, 3, 4, 5]))
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
[1, 3, 6, 10, 15]
>>> import operator
>>> list(_accumulate([1, 2, 3, 4, 5], operator.mul))
[1, 2, 6, 24, 120]
"""
"""
it
=
iter
(
iterable
)
it
=
iter
(
iterable
)
...
...
python/paddle/io/dataloader/sampler.py
浏览文件 @
7c9b1ab6
...
@@ -44,34 +44,39 @@ class Sampler:
...
@@ -44,34 +44,39 @@ class Sampler:
.. code-block:: python
.. code-block:: python
from paddle.io import Dataset, Sampler
>>> from paddle.io import Dataset, Sampler
class RandomDataset(Dataset):
>>> class RandomDataset(Dataset):
def __init__(self, num_samples):
... def __init__(self, num_samples):
self.num_samples = num_samples
... self.num_samples = num_samples
...
def __getitem__(self, idx):
... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
... return image, label
...
def __len__(self):
... def __len__(self):
return self.num_samples
... return self.num_samples
...
class MySampler(Sampler):
>>> class MySampler(Sampler):
def __init__(self, data_source):
... def __init__(self, data_source):
self.data_source = data_source
... self.data_source = data_source
...
def __iter__(self):
... def __iter__(self):
return iter(range(len(self.data_source)))
... return iter(range(len(self.data_source)))
...
def __len__(self):
... def __len__(self):
return len(self.data_source)
... return len(self.data_source)
...
sampler = MySampler(data_source=RandomDataset(100))
>>> sampler = MySampler(data_source=RandomDataset(100))
for index in sampler:
>>> for index in sampler:
print(index)
... print(index)
0
1
2
...
99
see `paddle.io.BatchSampler`
see `paddle.io.BatchSampler`
see `paddle.io.DataLoader`
see `paddle.io.DataLoader`
...
@@ -105,24 +110,29 @@ class SequenceSampler(Sampler):
...
@@ -105,24 +110,29 @@ class SequenceSampler(Sampler):
.. code-block:: python
.. code-block:: python
from paddle.io import Dataset, SequenceSampler
>>> from paddle.io import Dataset, SequenceSampler
class RandomDataset(Dataset):
>>> class RandomDataset(Dataset):
def __init__(self, num_samples):
... def __init__(self, num_samples):
self.num_samples = num_samples
... self.num_samples = num_samples
...
def __getitem__(self, idx):
... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
... return image, label
...
def __len__(self):
... def __len__(self):
return self.num_samples
... return self.num_samples
...
sampler = SequenceSampler(data_source=RandomDataset(100))
>>> sampler = SequenceSampler(data_source=RandomDataset(100))
for index in sampler:
>>> for index in sampler:
print(index)
... print(index)
0
1
2
...
99
see `paddle.io.Sampler`
see `paddle.io.Sampler`
"""
"""
...
@@ -160,25 +170,31 @@ class RandomSampler(Sampler):
...
@@ -160,25 +170,31 @@ class RandomSampler(Sampler):
.. code-block:: python
.. code-block:: python
from paddle.io import Dataset, RandomSampler
>>> import numpy as np
>>> from paddle.io import Dataset, RandomSampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
>>> np.random.seed(2023)
self.num_samples = num_samples
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
def __getitem__(self, idx):
... self.num_samples = num_samples
image = np.random.random([784]).astype('float32')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
... def __getitem__(self, idx):
return image, label
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
def __len__(self):
... return image, label
return self.num_samples
...
... def __len__(self):
sampler = RandomSampler(data_source=RandomDataset(100))
... return self.num_samples
...
for index in sampler:
>>> sampler = RandomSampler(data_source=RandomDataset(100))
print(index)
>>> for index in sampler:
... print(index)
56
12
68
...
87
"""
"""
def
__init__
(
def
__init__
(
...
@@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler):
...
@@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler):
.. code-block:: python
.. code-block:: python
from paddle.io import WeightedRandomSampler
>>> import numpy as np
>>> from paddle.io import WeightedRandomSampler
sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2],
num_samples=5,
>>> np.random.seed(2023)
replacement=True)
>>> sampler = WeightedRandomSampler(
... weights=[0.1, 0.3, 0.5, 0.7, 0.2],
for index in sampler:
... num_samples=5,
print(index)
... replacement=True
... )
>>> for index in sampler:
... print(index)
2
4
3
1
1
"""
"""
def
__init__
(
self
,
weights
,
num_samples
,
replacement
=
True
):
def
__init__
(
self
,
weights
,
num_samples
,
replacement
=
True
):
...
...
python/paddle/io/dataloader/worker.py
浏览文件 @
7c9b1ab6
...
@@ -13,8 +13,6 @@
...
@@ -13,8 +13,6 @@
# limitations under the License.
# limitations under the License.
import
os
import
os
# NOTE: queue has a different name in python2 and python3
import
queue
import
queue
import
sys
import
sys
import
traceback
import
traceback
...
@@ -94,51 +92,64 @@ def get_worker_info():
...
@@ -94,51 +92,64 @@ def get_worker_info():
Returns:
Returns:
WorkerInfo: an instance of WorkerInfo which contains fields above.
WorkerInfo: an instance of WorkerInfo which contains fields above.
.. note::
Notes:
For more usage and examples, please see :code:`paddle.io.IterableDataset`
For more usage and examples, please see :code:`paddle.io.IterableDataset`
Example:
Example:
.. code-block:: python
.. code-block:: python
import math
>>> import math
import paddle
>>> import paddle
import numpy as np
>>> import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset):
>>> class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end):
... def __init__(self, start, end):
self.start = start
... self.start = start
self.end = end
... self.end = end
...
def __iter__(self):
... def __iter__(self):
worker_info = get_worker_info()
... worker_info = get_worker_info()
if worker_info is None:
... if worker_info is None:
iter_start = self.start
... iter_start = self.start
iter_end = self.end
... iter_end = self.end
else:
... else:
per_worker = int(
... per_worker = int(
math.ceil((self.end - self.start) / float(
... math.ceil((self.end - self.start) / float(
worker_info.num_workers)))
... worker_info.num_workers)))
worker_id = worker_info.id
... worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
... iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end)
... iter_end = min(iter_start + per_worker, self.end)
...
for i in range(iter_start, iter_end):
... for i in range(iter_start, iter_end):
yield np.array([i])
... yield np.array([i])
...
place = paddle.CPUPlace()
>>> place = paddle.CPUPlace()
dataset = SplitedIterableDataset(start=2, end=9)
>>> dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(
>>> dataloader = DataLoader(
dataset,
... dataset,
places=place,
... places=place,
num_workers=2,
... num_workers=2,
batch_size=1,
... batch_size=1,
drop_last=True)
... drop_last=True)
...
for data in dataloader:
>>> for data in dataloader:
print(data)
... print(data) # doctest: +SKIP("The output depends on the environment.")
# outputs: [2, 5, 3, 6, 4, 7]
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
"""
"""
return
_worker_info
return
_worker_info
...
...
python/paddle/io/multiprocess_utils.py
浏览文件 @
7c9b1ab6
...
@@ -13,8 +13,6 @@
...
@@ -13,8 +13,6 @@
# limitations under the License.
# limitations under the License.
import
atexit
import
atexit
# NOTE: queue has a different name in python2 and python3
import
queue
import
queue
import
signal
import
signal
import
sys
import
sys
...
...
python/paddle/io/reader.py
浏览文件 @
7c9b1ab6
...
@@ -14,8 +14,6 @@
...
@@ -14,8 +14,6 @@
import
copy
import
copy
import
multiprocessing
import
multiprocessing
# NOTE: queue has a different name in python2 and python3
import
sys
import
sys
import
time
import
time
import
warnings
import
warnings
...
@@ -234,7 +232,7 @@ class DataLoader:
...
@@ -234,7 +232,7 @@ class DataLoader:
For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
.. note::
Notes:
GPU tensor operation is not supported in subprocess currently,
GPU tensor operation is not supported in subprocess currently,
please don't use GPU tensor operations in pipeline which will
please don't use GPU tensor operations in pipeline which will
be performed in subprocess, such as dataset transforms, collate_fn,
be performed in subprocess, such as dataset transforms, collate_fn,
...
@@ -250,7 +248,7 @@ class DataLoader:
...
@@ -250,7 +248,7 @@ class DataLoader:
:attr:`collate_fn` or :attr:`default_collate_fn`.
:attr:`collate_fn` or :attr:`default_collate_fn`.
.. note::
Notes:
When automatic batching is disabled, :attr:`default_collate_fn` will
When automatic batching is disabled, :attr:`default_collate_fn` will
do nothing to data from dataset.
do nothing to data from dataset.
...
@@ -321,68 +319,66 @@ class DataLoader:
...
@@ -321,68 +319,66 @@ class DataLoader:
.. code-block:: python
.. code-block:: python
import numpy as np
>>> import numpy as np
import paddle
>>> import paddle
import paddle.nn as nn
>>> import paddle.nn as nn
import paddle.nn.functional as F
>>> import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader
>>> from paddle.io import Dataset, BatchSampler, DataLoader
BATCH_NUM = 20
>>> BATCH_NUM = 20
BATCH_SIZE = 16
>>> BATCH_SIZE = 16
EPOCH_NUM = 4
>>> EPOCH_NUM = 4
IMAGE_SIZE = 784
>>> IMAGE_SIZE = 784
CLASS_NUM = 10
>>> CLASS_NUM = 10
# define a random dataset
>>> # define a random dataset
class RandomDataset(Dataset):
>>> class RandomDataset(Dataset):
def __init__(self, num_samples):
... def __init__(self, num_samples):
self.num_samples = num_samples
... self.num_samples = num_samples
...
def __getitem__(self, idx):
... def __getitem__(self, idx):
image = np.random.random([IMAGE_SIZE]).astype('float32')
... image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
return image, label
... return image, label
...
def __len__(self):
... def __len__(self):
return self.num_samples
... return self.num_samples
...
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
>>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
class SimpleNet(nn.Layer):
>>> class SimpleNet(nn.Layer):
def __init__(self):
... def __init__(self):
super().__init__()
... super().__init__()
self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
... self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
...
def forward(self, image, label=None):
... def forward(self, image, label=None):
return self.fc(image)
... return self.fc(image)
...
simple_net = SimpleNet()
>>> simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
>>> opt = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=simple_net.parameters())
... parameters=simple_net.parameters())
...
loader = DataLoader(dataset,
>>> loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
... batch_size=BATCH_SIZE,
shuffle=True,
... shuffle=True,
drop_last=True,
... drop_last=True,
num_workers=2)
... num_workers=2)
...
for e in range(EPOCH_NUM):
>>> for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
... for i, (image, label) in enumerate(loader()):
out = simple_net(image)
... out = simple_net(image)
loss = F.cross_entropy(out, label)
... loss = F.cross_entropy(out, label)
avg_loss = paddle.mean(loss)
... avg_loss = paddle.mean(loss)
avg_loss.backward()
... avg_loss.backward()
opt.minimize(avg_loss)
... opt.minimize(avg_loss)
simple_net.clear_gradients()
... simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
... print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
Notes:
.. note::
For reading iterable dataset with multiprocess Dataloader,
For reading iterable dataset with multiprocess Dataloader,
please see :code:`paddle.io.IterableDataset`
please see :code:`paddle.io.IterableDataset`
"""
"""
def
__init__
(
def
__init__
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录