Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
7c9b1ab6
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7c9b1ab6
编写于
8月 03, 2023
作者:
N
Nyakku Shigure
提交者:
GitHub
8月 03, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[xdoctest] reformat example code with google style in `paddle/io` (#55732)
上级
ff226ba1
变更
6
显示空白变更内容
内联
并排
Showing
6 changed files
with
524 additions
and
465 deletions
+524
-465
python/paddle/io/dataloader/batch_sampler.py
python/paddle/io/dataloader/batch_sampler.py
+83
-79
python/paddle/io/dataloader/dataset.py
python/paddle/io/dataloader/dataset.py
+232
-206
python/paddle/io/dataloader/sampler.py
python/paddle/io/dataloader/sampler.py
+97
-73
python/paddle/io/dataloader/worker.py
python/paddle/io/dataloader/worker.py
+52
-41
python/paddle/io/multiprocess_utils.py
python/paddle/io/multiprocess_utils.py
+0
-2
python/paddle/io/reader.py
python/paddle/io/reader.py
+60
-64
未找到文件。
python/paddle/io/dataloader/batch_sampler.py
浏览文件 @
7c9b1ab6
...
...
@@ -58,40 +58,44 @@ class BatchSampler(Sampler):
.. code-block:: python
from paddle.io import RandomSampler, BatchSampler, Dataset
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
bs = BatchSampler(dataset=RandomDataset(100),
shuffle=False,
batch_size=16,
drop_last=False)
for batch_indices in bs:
print(batch_indices)
# init with sampler
sampler = RandomSampler(RandomDataset(100))
bs = BatchSampler(sampler=sampler,
batch_size=8,
drop_last=True)
for batch_indices in bs:
print(batch_indices)
>>> import numpy as np
>>> from paddle.io import RandomSampler, BatchSampler, Dataset
>>> np.random.seed(2023)
>>> # init with dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> bs = BatchSampler(dataset=RandomDataset(100),
... shuffle=False,
... batch_size=16,
... drop_last=False)
...
>>> for batch_indices in bs:
... print(batch_indices)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
...
[96, 97, 98, 99]
>>> # init with sampler
>>> sampler = RandomSampler(RandomDataset(100))
>>> bs = BatchSampler(sampler=sampler,
... batch_size=8,
... drop_last=True)
...
>>> for batch_indices in bs:
... print(batch_indices)
[56, 12, 68, 0, 82, 66, 91, 44]
...
[53, 17, 22, 86, 52, 3, 92, 33]
"""
def
__init__
(
...
...
@@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler):
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for data in sampler:
# do something
break
>>>
import numpy as np
>>>
from paddle.io import Dataset, DistributedBatchSampler
>>>
# init with dataset
>>>
class RandomDataset(Dataset):
...
def __init__(self, num_samples):
...
self.num_samples = num_samples
...
...
def __getitem__(self, idx):
...
image = np.random.random([784]).astype('float32')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
...
return image, label
...
...
def __len__(self):
...
return self.num_samples
...
>>>
dataset = RandomDataset(100)
>>>
sampler = DistributedBatchSampler(dataset, batch_size=64)
>>>
for data in sampler:
...
# do something
...
break
"""
def
__init__
(
...
...
@@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler):
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for epoch in range(10):
sampler.set_epoch(epoch)
>>>
import numpy as np
>>>
from paddle.io import Dataset, DistributedBatchSampler
>>>
# init with dataset
>>>
class RandomDataset(Dataset):
...
def __init__(self, num_samples):
...
self.num_samples = num_samples
...
...
def __getitem__(self, idx):
...
image = np.random.random([784]).astype('float32')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
...
return image, label
...
...
def __len__(self):
...
return self.num_samples
...
>>>
dataset = RandomDataset(100)
>>>
sampler = DistributedBatchSampler(dataset, batch_size=64)
>>>
for epoch in range(10):
...
sampler.set_epoch(epoch)
"""
self
.
epoch
=
epoch
python/paddle/io/dataloader/dataset.py
浏览文件 @
7c9b1ab6
...
...
@@ -37,26 +37,26 @@ class Dataset:
.. code-block:: python
import numpy as np
from paddle.io import Dataset
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(10)
for i in range(len(dataset)):
print(dataset[i])
>>>
import numpy as np
>>>
from paddle.io import Dataset
>>>
# define a random dataset
>>>
class RandomDataset(Dataset):
...
def __init__(self, num_samples):
...
self.num_samples = num_samples
...
...
def __getitem__(self, idx):
...
image = np.random.random([784]).astype('float32')
...
label = np.random.randint(0, 9, (1, )).astype('int64')
...
return image, label
...
...
def __len__(self):
...
return self.num_samples
...
>>>
dataset = RandomDataset(10)
>>>
for i in range(len(dataset)):
... image, label = dataset[i]
... # do something
"""
def
__init__
(
self
):
...
...
@@ -95,23 +95,24 @@ class IterableDataset(Dataset):
.. code-block:: python
:name: code-example1
import numpy as np
from paddle.io import IterableDataset
# define a random dataset
class RandomDataset(IterableDataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __iter__(self):
for i in range(self.num_samples):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label
dataset = RandomDataset(10)
for img, lbl in dataset:
print(img, lbl)
>>> import numpy as np
>>> from paddle.io import IterableDataset
>>> # define a random dataset
>>> class RandomDataset(IterableDataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __iter__(self):
... for i in range(self.num_samples):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... yield image, label
...
>>> dataset = RandomDataset(10)
>>> for img, label in dataset:
... # do something
... ...
When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
will yield whole dataset samples, which means samples in dataset will be repeated in
...
...
@@ -125,87 +126,113 @@ class IterableDataset(Dataset):
.. code-block:: python
:name: code-example2
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end):
self.start = start
self.end = end
def __iter__(self):
worker_info = get_worker_info()
if worker_info is None:
iter_start = self.start
iter_end = self.end
else:
per_worker = int(
math.ceil((self.end - self.start) / float(
worker_info.num_workers)))
worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end)
for i in range(iter_start, iter_end):
yield np.array([i])
dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(
dataset,
num_workers=2,
batch_size=1,
drop_last=True)
for data in dataloader:
print(data)
# outputs: [2, 5, 3, 6, 4, 7]
>>> import math
>>> import paddle
>>> import numpy as np
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> class SplitedIterableDataset(IterableDataset):
... def __init__(self, start, end):
... self.start = start
... self.end = end
...
... def __iter__(self):
... worker_info = get_worker_info()
... if worker_info is None:
... iter_start = self.start
... iter_end = self.end
... else:
... per_worker = int(
... math.ceil((self.end - self.start) / float(
... worker_info.num_workers)))
... worker_id = worker_info.id
... iter_start = self.start + worker_id * per_worker
... iter_end = min(iter_start + per_worker, self.end)
...
... for i in range(iter_start, iter_end):
... yield np.array([i])
...
>>> dataset = SplitedIterableDataset(start=2, end=9)
>>> dataloader = DataLoader(
... dataset,
... num_workers=2,
... batch_size=1,
... drop_last=True)
...
>>> for data in dataloader:
... print(data) # doctest: +SKIP("The output depends on the environment.")
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
splitting data copy in each worker by :code:`worker_init_fn`
.. code-block:: python
:name: code-example3
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
class RangeIterableDataset(IterableDataset):
def __init__(self, start, end):
self.start = start
self.end = end
def __iter__(self):
for i in range(self.start, self.end):
yield np.array([i])
dataset = RangeIterableDataset(start=2, end=9)
def worker_init_fn(worker_id):
worker_info = get_worker_info()
dataset = worker_info.dataset
start = dataset.start
end = dataset.end
num_per_worker = int(
math.ceil((end - start) / float(worker_info.num_workers)))
worker_id = worker_info.id
dataset.start = start + worker_id * num_per_worker
dataset.end = min(dataset.start + num_per_worker, end)
dataloader = DataLoader(
dataset,
num_workers=2,
batch_size=1,
drop_last=True,
worker_init_fn=worker_init_fn)
for data in dataloader:
print(data)
# outputs: [2, 5, 3, 6, 4, 7]
>>> import math
>>> import paddle
>>> import numpy as np
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> class RangeIterableDataset(IterableDataset):
... def __init__(self, start, end):
... self.start = start
... self.end = end
...
... def __iter__(self):
... for i in range(self.start, self.end):
... yield np.array([i])
...
>>> dataset = RangeIterableDataset(start=2, end=9)
>>> def worker_init_fn(worker_id):
... worker_info = get_worker_info()
...
... dataset = worker_info.dataset
... start = dataset.start
... end = dataset.end
... num_per_worker = int(
... math.ceil((end - start) / float(worker_info.num_workers)))
...
... worker_id = worker_info.id
... dataset.start = start + worker_id * num_per_worker
... dataset.end = min(dataset.start + num_per_worker, end)
...
>>> dataloader = DataLoader(
... dataset,
... num_workers=2,
... batch_size=1,
... drop_last=True,
... worker_init_fn=worker_init_fn)
...
>>> for data in dataloader:
... print(data) # doctest: +SKIP("The output depends on the environment.")
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
"""
...
...
@@ -249,22 +276,21 @@ class TensorDataset(Dataset):
.. code-block:: python
import numpy as np
import paddle
from paddle.io import TensorDataset
>>> import numpy as np
>>> import paddle
>>> from paddle.io import TensorDataset
input_np = np.random.random([2, 3, 4]).astype('float32')
input = paddle.to_tensor(input_np)
label_np = np.random.random([2, 1]).astype('int32')
label = paddle.to_tensor(label_np)
dataset = TensorDataset([input, label])
>>> input_np = np.random.random([2, 3, 4]).astype('float32')
>>> input = paddle.to_tensor(input_np)
>>> label_np = np.random.random([2, 1]).astype('int32')
>>> label = paddle.to_tensor(label_np)
for i in range(len(dataset)):
input, label = dataset[i]
print(input, label)
>>> dataset = TensorDataset([input, label])
>>> for i in range(len(dataset)):
... input, label = dataset[i]
... # do something
"""
def
__init__
(
self
,
tensors
):
...
...
@@ -309,32 +335,28 @@ class ComposeDataset(Dataset):
.. code-block:: python
import numpy as np
import paddle
from paddle.io import Dataset, ComposeDataset
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
for i in range(len(dataset)):
image1, label1, image2, label2 = dataset[i]
print(image1)
print(label1)
print(image2)
print(label2)
>>> import numpy as np
>>> import paddle
>>> from paddle.io import Dataset, ComposeDataset
>>> # define a random dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([32]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
>>> for i in range(len(dataset)):
... image1, label1, image2, label2 = dataset[i]
... # do something
"""
def
__init__
(
self
,
datasets
):
...
...
@@ -379,25 +401,26 @@ class ChainDataset(IterableDataset):
.. code-block:: python
import numpy as np
import paddle
from paddle.io import IterableDataset, ChainDataset
# define a random dataset
class RandomDataset(IterableDataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __iter__(self):
for i in range(10):
image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label
dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
for image, label in iter(dataset):
print(image, label)
>>> import numpy as np
>>> import paddle
>>> from paddle.io import IterableDataset, ChainDataset
>>> # define a random dataset
>>> class RandomDataset(IterableDataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __iter__(self):
... for i in range(10):
... image = np.random.random([32]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... yield image, label
...
>>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
>>> for image, label in iter(dataset):
... # do something
... ...
"""
...
...
@@ -430,18 +453,18 @@ class Subset(Dataset):
.. code-block:: python
import paddle
from paddle.io import Subset
>>>
import paddle
>>>
from paddle.io import Subset
# example 1:
a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
print(list(a))
#
[1, 3]
>>>
# example 1:
>>>
a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
>>>
print(list(a))
[1, 3]
# example 2:
b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
print(list(b))
#
[2, 2]
>>>
# example 2:
>>>
b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
>>>
print(list(b))
[2, 2]
"""
def
__init__
(
self
,
dataset
,
indices
):
...
...
@@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None):
.. code-block:: python
import paddle
from paddle.io import random_split
a_list = paddle.io.random_split(range(10), [3, 7])
print(len(a_list))
# 2
for idx, v in enumerate(a_list[0]):
print(idx, v)
# output of the first subset
# 0 1
# 1 3
# 2 9
for idx, v in enumerate(a_list[1]):
print(idx, v)
# output of the second subset
# 0 5
# 1 7
# 2 8
# 3 6
# 4 0
# 5 2
# 6 4
>>> import paddle
>>> paddle.seed(2023)
>>> a_list = paddle.io.random_split(range(10), [3, 7])
>>> print(len(a_list))
2
>>> # output of the first subset
>>> for idx, v in enumerate(a_list[0]):
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
0 7
1 6
2 5
>>> # output of the second subset
>>> for idx, v in enumerate(a_list[1]):
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
0 1
1 9
2 4
3 2
4 0
5 3
6 8
"""
# Cannot verify that dataset is Sized
if
sum
(
lengths
)
!=
len
(
dataset
):
# type: ignore
...
...
@@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y):
.. code-block:: python
_accumulate([1,2,3,4,5]) --> 1 3 6 10 15
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
>>> list(_accumulate([1, 2, 3, 4, 5]))
[1, 3, 6, 10, 15]
>>> import operator
>>> list(_accumulate([1, 2, 3, 4, 5], operator.mul))
[1, 2, 6, 24, 120]
"""
it
=
iter
(
iterable
)
...
...
python/paddle/io/dataloader/sampler.py
浏览文件 @
7c9b1ab6
...
...
@@ -44,34 +44,39 @@ class Sampler:
.. code-block:: python
from paddle.io import Dataset, Sampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
class MySampler(Sampler):
def __init__(self, data_source):
self.data_source = data_source
def __iter__(self):
return iter(range(len(self.data_source)))
def __len__(self):
return len(self.data_source)
sampler = MySampler(data_source=RandomDataset(100))
for index in sampler:
print(index)
>>> from paddle.io import Dataset, Sampler
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> class MySampler(Sampler):
... def __init__(self, data_source):
... self.data_source = data_source
...
... def __iter__(self):
... return iter(range(len(self.data_source)))
...
... def __len__(self):
... return len(self.data_source)
...
>>> sampler = MySampler(data_source=RandomDataset(100))
>>> for index in sampler:
... print(index)
0
1
2
...
99
see `paddle.io.BatchSampler`
see `paddle.io.DataLoader`
...
...
@@ -105,24 +110,29 @@ class SequenceSampler(Sampler):
.. code-block:: python
from paddle.io import Dataset, SequenceSampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
sampler = SequenceSampler(data_source=RandomDataset(100))
for index in sampler:
print(index)
>>> from paddle.io import Dataset, SequenceSampler
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> sampler = SequenceSampler(data_source=RandomDataset(100))
>>> for index in sampler:
... print(index)
0
1
2
...
99
see `paddle.io.Sampler`
"""
...
...
@@ -160,25 +170,31 @@ class RandomSampler(Sampler):
.. code-block:: python
from paddle.io import Dataset, RandomSampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
sampler = RandomSampler(data_source=RandomDataset(100))
for index in sampler:
print(index)
>>> import numpy as np
>>> from paddle.io import Dataset, RandomSampler
>>> np.random.seed(2023)
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> sampler = RandomSampler(data_source=RandomDataset(100))
>>> for index in sampler:
... print(index)
56
12
68
...
87
"""
def
__init__
(
...
...
@@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler):
.. code-block:: python
from paddle.io import WeightedRandomSampler
sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2],
num_samples=5,
replacement=True)
for index in sampler:
print(index)
>>> import numpy as np
>>> from paddle.io import WeightedRandomSampler
>>> np.random.seed(2023)
>>> sampler = WeightedRandomSampler(
... weights=[0.1, 0.3, 0.5, 0.7, 0.2],
... num_samples=5,
... replacement=True
... )
>>> for index in sampler:
... print(index)
2
4
3
1
1
"""
def
__init__
(
self
,
weights
,
num_samples
,
replacement
=
True
):
...
...
python/paddle/io/dataloader/worker.py
浏览文件 @
7c9b1ab6
...
...
@@ -13,8 +13,6 @@
# limitations under the License.
import
os
# NOTE: queue has a different name in python2 and python3
import
queue
import
sys
import
traceback
...
...
@@ -94,51 +92,64 @@ def get_worker_info():
Returns:
WorkerInfo: an instance of WorkerInfo which contains fields above.
.. note:
:
Notes
:
For more usage and examples, please see :code:`paddle.io.IterableDataset`
Example:
.. code-block:: python
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end):
self.start = start
self.end = end
def __iter__(self):
worker_info = get_worker_info()
if worker_info is None:
iter_start = self.start
iter_end = self.end
else:
per_worker = int(
math.ceil((self.end - self.start) / float(
worker_info.num_workers)))
worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end)
for i in range(iter_start, iter_end):
yield np.array([i])
place = paddle.CPUPlace()
dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(
dataset,
places=place,
num_workers=2,
batch_size=1,
drop_last=True)
for data in dataloader:
print(data)
# outputs: [2, 5, 3, 6, 4, 7]
>>> import math
>>> import paddle
>>> import numpy as np
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> class SplitedIterableDataset(IterableDataset):
... def __init__(self, start, end):
... self.start = start
... self.end = end
...
... def __iter__(self):
... worker_info = get_worker_info()
... if worker_info is None:
... iter_start = self.start
... iter_end = self.end
... else:
... per_worker = int(
... math.ceil((self.end - self.start) / float(
... worker_info.num_workers)))
... worker_id = worker_info.id
... iter_start = self.start + worker_id * per_worker
... iter_end = min(iter_start + per_worker, self.end)
...
... for i in range(iter_start, iter_end):
... yield np.array([i])
...
>>> place = paddle.CPUPlace()
>>> dataset = SplitedIterableDataset(start=2, end=9)
>>> dataloader = DataLoader(
... dataset,
... places=place,
... num_workers=2,
... batch_size=1,
... drop_last=True)
...
>>> for data in dataloader:
... print(data) # doctest: +SKIP("The output depends on the environment.")
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
"""
return
_worker_info
...
...
python/paddle/io/multiprocess_utils.py
浏览文件 @
7c9b1ab6
...
...
@@ -13,8 +13,6 @@
# limitations under the License.
import
atexit
# NOTE: queue has a different name in python2 and python3
import
queue
import
signal
import
sys
...
...
python/paddle/io/reader.py
浏览文件 @
7c9b1ab6
...
...
@@ -14,8 +14,6 @@
import
copy
import
multiprocessing
# NOTE: queue has a different name in python2 and python3
import
sys
import
time
import
warnings
...
...
@@ -234,7 +232,7 @@ class DataLoader:
For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
.. note:
:
Notes
:
GPU tensor operation is not supported in subprocess currently,
please don't use GPU tensor operations in pipeline which will
be performed in subprocess, such as dataset transforms, collate_fn,
...
...
@@ -250,7 +248,7 @@ class DataLoader:
:attr:`collate_fn` or :attr:`default_collate_fn`.
.. note:
:
Notes
:
When automatic batching is disabled, :attr:`default_collate_fn` will
do nothing to data from dataset.
...
...
@@ -321,68 +319,66 @@ class DataLoader:
.. code-block:: python
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader
BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4
IMAGE_SIZE = 784
CLASS_NUM = 10
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
class SimpleNet(nn.Layer):
def __init__(self):
super().__init__()
self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
def forward(self, image, label=None):
return self.fc(image)
simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=simple_net.parameters())
loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = F.cross_entropy(out, label)
avg_loss = paddle.mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
.. note::
>>> import numpy as np
>>> import paddle
>>> import paddle.nn as nn
>>> import paddle.nn.functional as F
>>> from paddle.io import Dataset, BatchSampler, DataLoader
>>> BATCH_NUM = 20
>>> BATCH_SIZE = 16
>>> EPOCH_NUM = 4
>>> IMAGE_SIZE = 784
>>> CLASS_NUM = 10
>>> # define a random dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([IMAGE_SIZE]).astype('float32')
... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
>>> class SimpleNet(nn.Layer):
... def __init__(self):
... super().__init__()
... self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
...
... def forward(self, image, label=None):
... return self.fc(image)
...
>>> simple_net = SimpleNet()
>>> opt = paddle.optimizer.SGD(learning_rate=1e-3,
... parameters=simple_net.parameters())
...
>>> loader = DataLoader(dataset,
... batch_size=BATCH_SIZE,
... shuffle=True,
... drop_last=True,
... num_workers=2)
...
>>> for e in range(EPOCH_NUM):
... for i, (image, label) in enumerate(loader()):
... out = simple_net(image)
... loss = F.cross_entropy(out, label)
... avg_loss = paddle.mean(loss)
... avg_loss.backward()
... opt.minimize(avg_loss)
... simple_net.clear_gradients()
... print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
Notes:
For reading iterable dataset with multiprocess Dataloader,
please see :code:`paddle.io.IterableDataset`
"""
def
__init__
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录