未验证 提交 7c9b1ab6 编写于 作者: N Nyakku Shigure 提交者: GitHub

[xdoctest] reformat example code with google style in `paddle/io` (#55732)

上级 ff226ba1
......@@ -58,40 +58,44 @@ class BatchSampler(Sampler):
.. code-block:: python
from paddle.io import RandomSampler, BatchSampler, Dataset
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
bs = BatchSampler(dataset=RandomDataset(100),
shuffle=False,
batch_size=16,
drop_last=False)
for batch_indices in bs:
print(batch_indices)
# init with sampler
sampler = RandomSampler(RandomDataset(100))
bs = BatchSampler(sampler=sampler,
batch_size=8,
drop_last=True)
for batch_indices in bs:
print(batch_indices)
>>> import numpy as np
>>> from paddle.io import RandomSampler, BatchSampler, Dataset
>>> np.random.seed(2023)
>>> # init with dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> bs = BatchSampler(dataset=RandomDataset(100),
... shuffle=False,
... batch_size=16,
... drop_last=False)
...
>>> for batch_indices in bs:
... print(batch_indices)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
...
[96, 97, 98, 99]
>>> # init with sampler
>>> sampler = RandomSampler(RandomDataset(100))
>>> bs = BatchSampler(sampler=sampler,
... batch_size=8,
... drop_last=True)
...
>>> for batch_indices in bs:
... print(batch_indices)
[56, 12, 68, 0, 82, 66, 91, 44]
...
[53, 17, 22, 86, 52, 3, 92, 33]
"""
def __init__(
......@@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler):
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for data in sampler:
# do something
break
>>> import numpy as np
>>> from paddle.io import Dataset, DistributedBatchSampler
>>> # init with dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = RandomDataset(100)
>>> sampler = DistributedBatchSampler(dataset, batch_size=64)
>>> for data in sampler:
... # do something
... break
"""
def __init__(
......@@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler):
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for epoch in range(10):
sampler.set_epoch(epoch)
>>> import numpy as np
>>> from paddle.io import Dataset, DistributedBatchSampler
>>> # init with dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = RandomDataset(100)
>>> sampler = DistributedBatchSampler(dataset, batch_size=64)
>>> for epoch in range(10):
... sampler.set_epoch(epoch)
"""
self.epoch = epoch
......@@ -37,26 +37,26 @@ class Dataset:
.. code-block:: python
import numpy as np
from paddle.io import Dataset
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(10)
for i in range(len(dataset)):
print(dataset[i])
>>> import numpy as np
>>> from paddle.io import Dataset
>>> # define a random dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = RandomDataset(10)
>>> for i in range(len(dataset)):
... image, label = dataset[i]
... # do something
"""
def __init__(self):
......@@ -95,23 +95,24 @@ class IterableDataset(Dataset):
.. code-block:: python
:name: code-example1
import numpy as np
from paddle.io import IterableDataset
# define a random dataset
class RandomDataset(IterableDataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __iter__(self):
for i in range(self.num_samples):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label
dataset = RandomDataset(10)
for img, lbl in dataset:
print(img, lbl)
>>> import numpy as np
>>> from paddle.io import IterableDataset
>>> # define a random dataset
>>> class RandomDataset(IterableDataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __iter__(self):
... for i in range(self.num_samples):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... yield image, label
...
>>> dataset = RandomDataset(10)
>>> for img, label in dataset:
... # do something
... ...
When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
will yield whole dataset samples, which means samples in dataset will be repeated in
......@@ -125,87 +126,113 @@ class IterableDataset(Dataset):
.. code-block:: python
:name: code-example2
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end):
self.start = start
self.end = end
def __iter__(self):
worker_info = get_worker_info()
if worker_info is None:
iter_start = self.start
iter_end = self.end
else:
per_worker = int(
math.ceil((self.end - self.start) / float(
worker_info.num_workers)))
worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end)
for i in range(iter_start, iter_end):
yield np.array([i])
dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(
dataset,
num_workers=2,
batch_size=1,
drop_last=True)
for data in dataloader:
print(data)
# outputs: [2, 5, 3, 6, 4, 7]
>>> import math
>>> import paddle
>>> import numpy as np
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> class SplitedIterableDataset(IterableDataset):
... def __init__(self, start, end):
... self.start = start
... self.end = end
...
... def __iter__(self):
... worker_info = get_worker_info()
... if worker_info is None:
... iter_start = self.start
... iter_end = self.end
... else:
... per_worker = int(
... math.ceil((self.end - self.start) / float(
... worker_info.num_workers)))
... worker_id = worker_info.id
... iter_start = self.start + worker_id * per_worker
... iter_end = min(iter_start + per_worker, self.end)
...
... for i in range(iter_start, iter_end):
... yield np.array([i])
...
>>> dataset = SplitedIterableDataset(start=2, end=9)
>>> dataloader = DataLoader(
... dataset,
... num_workers=2,
... batch_size=1,
... drop_last=True)
...
>>> for data in dataloader:
... print(data) # doctest: +SKIP("The output depends on the environment.")
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
splitting data copy in each worker by :code:`worker_init_fn`
.. code-block:: python
:name: code-example3
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
class RangeIterableDataset(IterableDataset):
def __init__(self, start, end):
self.start = start
self.end = end
def __iter__(self):
for i in range(self.start, self.end):
yield np.array([i])
dataset = RangeIterableDataset(start=2, end=9)
def worker_init_fn(worker_id):
worker_info = get_worker_info()
dataset = worker_info.dataset
start = dataset.start
end = dataset.end
num_per_worker = int(
math.ceil((end - start) / float(worker_info.num_workers)))
worker_id = worker_info.id
dataset.start = start + worker_id * num_per_worker
dataset.end = min(dataset.start + num_per_worker, end)
dataloader = DataLoader(
dataset,
num_workers=2,
batch_size=1,
drop_last=True,
worker_init_fn=worker_init_fn)
for data in dataloader:
print(data)
# outputs: [2, 5, 3, 6, 4, 7]
>>> import math
>>> import paddle
>>> import numpy as np
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> class RangeIterableDataset(IterableDataset):
... def __init__(self, start, end):
... self.start = start
... self.end = end
...
... def __iter__(self):
... for i in range(self.start, self.end):
... yield np.array([i])
...
>>> dataset = RangeIterableDataset(start=2, end=9)
>>> def worker_init_fn(worker_id):
... worker_info = get_worker_info()
...
... dataset = worker_info.dataset
... start = dataset.start
... end = dataset.end
... num_per_worker = int(
... math.ceil((end - start) / float(worker_info.num_workers)))
...
... worker_id = worker_info.id
... dataset.start = start + worker_id * num_per_worker
... dataset.end = min(dataset.start + num_per_worker, end)
...
>>> dataloader = DataLoader(
... dataset,
... num_workers=2,
... batch_size=1,
... drop_last=True,
... worker_init_fn=worker_init_fn)
...
>>> for data in dataloader:
... print(data) # doctest: +SKIP("The output depends on the environment.")
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
"""
......@@ -249,22 +276,21 @@ class TensorDataset(Dataset):
.. code-block:: python
import numpy as np
import paddle
from paddle.io import TensorDataset
>>> import numpy as np
>>> import paddle
>>> from paddle.io import TensorDataset
input_np = np.random.random([2, 3, 4]).astype('float32')
input = paddle.to_tensor(input_np)
label_np = np.random.random([2, 1]).astype('int32')
label = paddle.to_tensor(label_np)
dataset = TensorDataset([input, label])
>>> input_np = np.random.random([2, 3, 4]).astype('float32')
>>> input = paddle.to_tensor(input_np)
>>> label_np = np.random.random([2, 1]).astype('int32')
>>> label = paddle.to_tensor(label_np)
for i in range(len(dataset)):
input, label = dataset[i]
print(input, label)
>>> dataset = TensorDataset([input, label])
>>> for i in range(len(dataset)):
... input, label = dataset[i]
... # do something
"""
def __init__(self, tensors):
......@@ -309,32 +335,28 @@ class ComposeDataset(Dataset):
.. code-block:: python
import numpy as np
import paddle
from paddle.io import Dataset, ComposeDataset
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
for i in range(len(dataset)):
image1, label1, image2, label2 = dataset[i]
print(image1)
print(label1)
print(image2)
print(label2)
>>> import numpy as np
>>> import paddle
>>> from paddle.io import Dataset, ComposeDataset
>>> # define a random dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([32]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
>>> for i in range(len(dataset)):
... image1, label1, image2, label2 = dataset[i]
... # do something
"""
def __init__(self, datasets):
......@@ -379,25 +401,26 @@ class ChainDataset(IterableDataset):
.. code-block:: python
import numpy as np
import paddle
from paddle.io import IterableDataset, ChainDataset
# define a random dataset
class RandomDataset(IterableDataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __iter__(self):
for i in range(10):
image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label
dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
for image, label in iter(dataset):
print(image, label)
>>> import numpy as np
>>> import paddle
>>> from paddle.io import IterableDataset, ChainDataset
>>> # define a random dataset
>>> class RandomDataset(IterableDataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __iter__(self):
... for i in range(10):
... image = np.random.random([32]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... yield image, label
...
>>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
>>> for image, label in iter(dataset):
... # do something
... ...
"""
......@@ -430,18 +453,18 @@ class Subset(Dataset):
.. code-block:: python
import paddle
from paddle.io import Subset
>>> import paddle
>>> from paddle.io import Subset
# example 1:
a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
print(list(a))
# [1, 3]
>>> # example 1:
>>> a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
>>> print(list(a))
[1, 3]
# example 2:
b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
print(list(b))
# [2, 2]
>>> # example 2:
>>> b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
>>> print(list(b))
[2, 2]
"""
def __init__(self, dataset, indices):
......@@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None):
.. code-block:: python
import paddle
from paddle.io import random_split
a_list = paddle.io.random_split(range(10), [3, 7])
print(len(a_list))
# 2
for idx, v in enumerate(a_list[0]):
print(idx, v)
# output of the first subset
# 0 1
# 1 3
# 2 9
for idx, v in enumerate(a_list[1]):
print(idx, v)
# output of the second subset
# 0 5
# 1 7
# 2 8
# 3 6
# 4 0
# 5 2
# 6 4
>>> import paddle
>>> paddle.seed(2023)
>>> a_list = paddle.io.random_split(range(10), [3, 7])
>>> print(len(a_list))
2
>>> # output of the first subset
>>> for idx, v in enumerate(a_list[0]):
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
0 7
1 6
2 5
>>> # output of the second subset
>>> for idx, v in enumerate(a_list[1]):
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
0 1
1 9
2 4
3 2
4 0
5 3
6 8
"""
# Cannot verify that dataset is Sized
if sum(lengths) != len(dataset): # type: ignore
......@@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y):
.. code-block:: python
_accumulate([1,2,3,4,5]) --> 1 3 6 10 15
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
>>> list(_accumulate([1, 2, 3, 4, 5]))
[1, 3, 6, 10, 15]
>>> import operator
>>> list(_accumulate([1, 2, 3, 4, 5], operator.mul))
[1, 2, 6, 24, 120]
"""
it = iter(iterable)
......
......@@ -44,34 +44,39 @@ class Sampler:
.. code-block:: python
from paddle.io import Dataset, Sampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
class MySampler(Sampler):
def __init__(self, data_source):
self.data_source = data_source
def __iter__(self):
return iter(range(len(self.data_source)))
def __len__(self):
return len(self.data_source)
sampler = MySampler(data_source=RandomDataset(100))
for index in sampler:
print(index)
>>> import numpy as np
>>> from paddle.io import Dataset, Sampler
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> class MySampler(Sampler):
... def __init__(self, data_source):
... self.data_source = data_source
...
... def __iter__(self):
... return iter(range(len(self.data_source)))
...
... def __len__(self):
... return len(self.data_source)
...
>>> sampler = MySampler(data_source=RandomDataset(100))
>>> for index in sampler:
... print(index)
0
1
2
...
99
see `paddle.io.BatchSampler`
see `paddle.io.DataLoader`
......@@ -105,24 +110,29 @@ class SequenceSampler(Sampler):
.. code-block:: python
from paddle.io import Dataset, SequenceSampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
sampler = SequenceSampler(data_source=RandomDataset(100))
for index in sampler:
print(index)
>>> import numpy as np
>>> from paddle.io import Dataset, SequenceSampler
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> sampler = SequenceSampler(data_source=RandomDataset(100))
>>> for index in sampler:
... print(index)
0
1
2
...
99
see `paddle.io.Sampler`
"""
......@@ -160,25 +170,31 @@ class RandomSampler(Sampler):
.. code-block:: python
from paddle.io import Dataset, RandomSampler
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
sampler = RandomSampler(data_source=RandomDataset(100))
for index in sampler:
print(index)
>>> import numpy as np
>>> from paddle.io import Dataset, RandomSampler
>>> np.random.seed(2023)
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> sampler = RandomSampler(data_source=RandomDataset(100))
>>> for index in sampler:
... print(index)
56
12
68
...
87
"""
def __init__(
......@@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler):
.. code-block:: python
from paddle.io import WeightedRandomSampler
sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2],
num_samples=5,
replacement=True)
for index in sampler:
print(index)
>>> import numpy as np
>>> from paddle.io import WeightedRandomSampler
>>> np.random.seed(2023)
>>> sampler = WeightedRandomSampler(
... weights=[0.1, 0.3, 0.5, 0.7, 0.2],
... num_samples=5,
... replacement=True
... )
>>> for index in sampler:
... print(index)
2
4
3
1
1
"""
def __init__(self, weights, num_samples, replacement=True):
......
......@@ -13,8 +13,6 @@
# limitations under the License.
import os
# NOTE: queue has a different name in python2 and python3
import queue
import sys
import traceback
......@@ -94,51 +92,64 @@ def get_worker_info():
Returns:
WorkerInfo: an instance of WorkerInfo which contains fields above.
.. note::
Notes:
For more usage and examples, please see :code:`paddle.io.IterableDataset`
Example:
.. code-block:: python
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end):
self.start = start
self.end = end
def __iter__(self):
worker_info = get_worker_info()
if worker_info is None:
iter_start = self.start
iter_end = self.end
else:
per_worker = int(
math.ceil((self.end - self.start) / float(
worker_info.num_workers)))
worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end)
for i in range(iter_start, iter_end):
yield np.array([i])
place = paddle.CPUPlace()
dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(
dataset,
places=place,
num_workers=2,
batch_size=1,
drop_last=True)
for data in dataloader:
print(data)
# outputs: [2, 5, 3, 6, 4, 7]
>>> import math
>>> import paddle
>>> import numpy as np
>>> from paddle.io import IterableDataset, DataLoader, get_worker_info
>>> class SplitedIterableDataset(IterableDataset):
... def __init__(self, start, end):
... self.start = start
... self.end = end
...
... def __iter__(self):
... worker_info = get_worker_info()
... if worker_info is None:
... iter_start = self.start
... iter_end = self.end
... else:
... per_worker = int(
... math.ceil((self.end - self.start) / float(
... worker_info.num_workers)))
... worker_id = worker_info.id
... iter_start = self.start + worker_id * per_worker
... iter_end = min(iter_start + per_worker, self.end)
...
... for i in range(iter_start, iter_end):
... yield np.array([i])
...
>>> place = paddle.CPUPlace()
>>> dataset = SplitedIterableDataset(start=2, end=9)
>>> dataloader = DataLoader(
... dataset,
... places=place,
... num_workers=2,
... batch_size=1,
... drop_last=True)
...
>>> for data in dataloader:
... print(data) # doctest: +SKIP("The output depends on the environment.")
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
"""
return _worker_info
......
......@@ -13,8 +13,6 @@
# limitations under the License.
import atexit
# NOTE: queue has a different name in python2 and python3
import queue
import signal
import sys
......
......@@ -14,8 +14,6 @@
import copy
import multiprocessing
# NOTE: queue has a different name in python2 and python3
import sys
import time
import warnings
......@@ -234,7 +232,7 @@ class DataLoader:
For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
.. note::
Notes:
GPU tensor operation is not supported in subprocess currently,
please don't use GPU tensor operations in pipeline which will
be performed in subprocess, such as dataset transforms, collate_fn,
......@@ -250,7 +248,7 @@ class DataLoader:
:attr:`collate_fn` or :attr:`default_collate_fn`.
.. note::
Notes:
When automatic batching is disabled, :attr:`default_collate_fn` will
do nothing to data from dataset.
......@@ -321,68 +319,66 @@ class DataLoader:
.. code-block:: python
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader
BATCH_NUM = 20
BATCH_SIZE = 16
EPOCH_NUM = 4
IMAGE_SIZE = 784
CLASS_NUM = 10
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
class SimpleNet(nn.Layer):
def __init__(self):
super().__init__()
self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
def forward(self, image, label=None):
return self.fc(image)
simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=simple_net.parameters())
loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = F.cross_entropy(out, label)
avg_loss = paddle.mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
.. note::
>>> import numpy as np
>>> import paddle
>>> import paddle.nn as nn
>>> import paddle.nn.functional as F
>>> from paddle.io import Dataset, BatchSampler, DataLoader
>>> BATCH_NUM = 20
>>> BATCH_SIZE = 16
>>> EPOCH_NUM = 4
>>> IMAGE_SIZE = 784
>>> CLASS_NUM = 10
>>> # define a random dataset
>>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
... self.num_samples = num_samples
...
... def __getitem__(self, idx):
... image = np.random.random([IMAGE_SIZE]).astype('float32')
... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
... return image, label
...
... def __len__(self):
... return self.num_samples
...
>>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
>>> class SimpleNet(nn.Layer):
... def __init__(self):
... super().__init__()
... self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
...
... def forward(self, image, label=None):
... return self.fc(image)
...
>>> simple_net = SimpleNet()
>>> opt = paddle.optimizer.SGD(learning_rate=1e-3,
... parameters=simple_net.parameters())
...
>>> loader = DataLoader(dataset,
... batch_size=BATCH_SIZE,
... shuffle=True,
... drop_last=True,
... num_workers=2)
...
>>> for e in range(EPOCH_NUM):
... for i, (image, label) in enumerate(loader()):
... out = simple_net(image)
... loss = F.cross_entropy(out, label)
... avg_loss = paddle.mean(loss)
... avg_loss.backward()
... opt.minimize(avg_loss)
... simple_net.clear_gradients()
... print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
Notes:
For reading an iterable dataset with a multiprocess DataLoader,
please see :code:`paddle.io.IterableDataset`
"""
def __init__(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册