未验证 提交 7c9b1ab6 编写于 作者: N Nyakku Shigure 提交者: GitHub

[xdoctest] reformat example code with google style in `paddle/io` (#55732)

上级 ff226ba1
...@@ -58,40 +58,44 @@ class BatchSampler(Sampler): ...@@ -58,40 +58,44 @@ class BatchSampler(Sampler):
.. code-block:: python .. code-block:: python
from paddle.io import RandomSampler, BatchSampler, Dataset >>> import numpy as np
>>> from paddle.io import RandomSampler, BatchSampler, Dataset
# init with dataset
class RandomDataset(Dataset): >>> np.random.seed(2023)
def __init__(self, num_samples): >>> # init with dataset
self.num_samples = num_samples >>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
def __getitem__(self, idx): ... self.num_samples = num_samples
image = np.random.random([784]).astype('float32') ...
label = np.random.randint(0, 9, (1, )).astype('int64') ... def __getitem__(self, idx):
return image, label ... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
def __len__(self): ... return image, label
return self.num_samples ...
... def __len__(self):
bs = BatchSampler(dataset=RandomDataset(100), ... return self.num_samples
shuffle=False, ...
batch_size=16, >>> bs = BatchSampler(dataset=RandomDataset(100),
drop_last=False) ... shuffle=False,
... batch_size=16,
for batch_indices in bs: ... drop_last=False)
print(batch_indices) ...
>>> for batch_indices in bs:
# init with sampler ... print(batch_indices)
sampler = RandomSampler(RandomDataset(100)) [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
bs = BatchSampler(sampler=sampler, ...
batch_size=8, [96, 97, 98, 99]
drop_last=True) >>> # init with sampler
>>> sampler = RandomSampler(RandomDataset(100))
for batch_indices in bs: >>> bs = BatchSampler(sampler=sampler,
print(batch_indices) ... batch_size=8,
... drop_last=True)
...
>>> for batch_indices in bs:
... print(batch_indices)
[56, 12, 68, 0, 82, 66, 91, 44]
...
[53, 17, 22, 86, 52, 3, 92, 33]
""" """
def __init__( def __init__(
...@@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler): ...@@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
from paddle.io import Dataset, DistributedBatchSampler >>> from paddle.io import Dataset, DistributedBatchSampler
# init with dataset >>> # init with dataset
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32') ... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
dataset = RandomDataset(100) >>> dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64) >>> sampler = DistributedBatchSampler(dataset, batch_size=64)
for data in sampler: >>> for data in sampler:
# do something ... # do something
break ... break
""" """
def __init__( def __init__(
...@@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler): ...@@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
from paddle.io import Dataset, DistributedBatchSampler >>> from paddle.io import Dataset, DistributedBatchSampler
# init with dataset >>> # init with dataset
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32') ... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
dataset = RandomDataset(100) >>> dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64) >>> sampler = DistributedBatchSampler(dataset, batch_size=64)
for epoch in range(10): >>> for epoch in range(10):
sampler.set_epoch(epoch) ... sampler.set_epoch(epoch)
""" """
self.epoch = epoch self.epoch = epoch
...@@ -37,26 +37,26 @@ class Dataset: ...@@ -37,26 +37,26 @@ class Dataset:
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
from paddle.io import Dataset >>> from paddle.io import Dataset
# define a random dataset >>> # define a random dataset
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32') ... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
dataset = RandomDataset(10) >>> dataset = RandomDataset(10)
for i in range(len(dataset)): >>> for i in range(len(dataset)):
print(dataset[i]) ... image, label = dataset[i]
... # do something
""" """
def __init__(self): def __init__(self):
...@@ -95,23 +95,24 @@ class IterableDataset(Dataset): ...@@ -95,23 +95,24 @@ class IterableDataset(Dataset):
.. code-block:: python .. code-block:: python
:name: code-example1 :name: code-example1
import numpy as np >>> import numpy as np
from paddle.io import IterableDataset >>> from paddle.io import IterableDataset
# define a random dataset >>> # define a random dataset
class RandomDataset(IterableDataset): >>> class RandomDataset(IterableDataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __iter__(self): ... def __iter__(self):
for i in range(self.num_samples): ... for i in range(self.num_samples):
image = np.random.random([784]).astype('float32') ... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label ... yield image, label
...
dataset = RandomDataset(10) >>> dataset = RandomDataset(10)
for img, lbl in dataset: >>> for img, label in dataset:
print(img, lbl) ... # do something
... ...
When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
will yield whole dataset samples, which means samples in dataset will be repeated in will yield whole dataset samples, which means samples in dataset will be repeated in
...@@ -125,87 +126,113 @@ class IterableDataset(Dataset): ...@@ -125,87 +126,113 @@ class IterableDataset(Dataset):
.. code-block:: python .. code-block:: python
:name: code-example2 :name: code-example2
import math >>> import math
import paddle >>> import paddle
import numpy as np >>> import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info >>> from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset): >>> class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end): ... def __init__(self, start, end):
self.start = start ... self.start = start
self.end = end ... self.end = end
...
def __iter__(self): ... def __iter__(self):
worker_info = get_worker_info() ... worker_info = get_worker_info()
if worker_info is None: ... if worker_info is None:
iter_start = self.start ... iter_start = self.start
iter_end = self.end ... iter_end = self.end
else: ... else:
per_worker = int( ... per_worker = int(
math.ceil((self.end - self.start) / float( ... math.ceil((self.end - self.start) / float(
worker_info.num_workers))) ... worker_info.num_workers)))
worker_id = worker_info.id ... worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker ... iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end) ... iter_end = min(iter_start + per_worker, self.end)
...
for i in range(iter_start, iter_end): ... for i in range(iter_start, iter_end):
yield np.array([i]) ... yield np.array([i])
...
dataset = SplitedIterableDataset(start=2, end=9) >>> dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader( >>> dataloader = DataLoader(
dataset, ... dataset,
num_workers=2, ... num_workers=2,
batch_size=1, ... batch_size=1,
drop_last=True) ... drop_last=True)
...
for data in dataloader: >>> for data in dataloader:
print(data) ... print(data) # doctest: +SKIP("The output depends on the environment.")
# outputs: [2, 5, 3, 6, 4, 7] Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
splitting data copy in each worker by :code:`worker_init_fn` splitting data copy in each worker by :code:`worker_init_fn`
.. code-block:: python .. code-block:: python
:name: code-example3 :name: code-example3
import math >>> import math
import paddle >>> import paddle
import numpy as np >>> import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info >>> from paddle.io import IterableDataset, DataLoader, get_worker_info
class RangeIterableDataset(IterableDataset): >>> class RangeIterableDataset(IterableDataset):
def __init__(self, start, end): ... def __init__(self, start, end):
self.start = start ... self.start = start
self.end = end ... self.end = end
...
def __iter__(self): ... def __iter__(self):
for i in range(self.start, self.end): ... for i in range(self.start, self.end):
yield np.array([i]) ... yield np.array([i])
...
dataset = RangeIterableDataset(start=2, end=9) >>> dataset = RangeIterableDataset(start=2, end=9)
def worker_init_fn(worker_id): >>> def worker_init_fn(worker_id):
worker_info = get_worker_info() ... worker_info = get_worker_info()
...
dataset = worker_info.dataset ... dataset = worker_info.dataset
start = dataset.start ... start = dataset.start
end = dataset.end ... end = dataset.end
num_per_worker = int( ... num_per_worker = int(
math.ceil((end - start) / float(worker_info.num_workers))) ... math.ceil((end - start) / float(worker_info.num_workers)))
...
worker_id = worker_info.id ... worker_id = worker_info.id
dataset.start = start + worker_id * num_per_worker ... dataset.start = start + worker_id * num_per_worker
dataset.end = min(dataset.start + num_per_worker, end) ... dataset.end = min(dataset.start + num_per_worker, end)
...
dataloader = DataLoader( >>> dataloader = DataLoader(
dataset, ... dataset,
num_workers=2, ... num_workers=2,
batch_size=1, ... batch_size=1,
drop_last=True, ... drop_last=True,
worker_init_fn=worker_init_fn) ... worker_init_fn=worker_init_fn)
...
for data in dataloader: >>> for data in dataloader:
print(data) ... print(data) # doctest: +SKIP("The output depends on the environment.")
# outputs: [2, 5, 3, 6, 4, 7] Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
""" """
...@@ -249,22 +276,21 @@ class TensorDataset(Dataset): ...@@ -249,22 +276,21 @@ class TensorDataset(Dataset):
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
import paddle >>> import paddle
from paddle.io import TensorDataset >>> from paddle.io import TensorDataset
input_np = np.random.random([2, 3, 4]).astype('float32') >>> input_np = np.random.random([2, 3, 4]).astype('float32')
input = paddle.to_tensor(input_np) >>> input = paddle.to_tensor(input_np)
label_np = np.random.random([2, 1]).astype('int32') >>> label_np = np.random.random([2, 1]).astype('int32')
label = paddle.to_tensor(label_np) >>> label = paddle.to_tensor(label_np)
dataset = TensorDataset([input, label]) >>> dataset = TensorDataset([input, label])
for i in range(len(dataset)):
input, label = dataset[i]
print(input, label)
>>> for i in range(len(dataset)):
... input, label = dataset[i]
... # do something
""" """
def __init__(self, tensors): def __init__(self, tensors):
...@@ -309,32 +335,28 @@ class ComposeDataset(Dataset): ...@@ -309,32 +335,28 @@ class ComposeDataset(Dataset):
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
import paddle >>> import paddle
from paddle.io import Dataset, ComposeDataset >>> from paddle.io import Dataset, ComposeDataset
# define a random dataset >>> # define a random dataset
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([32]).astype('float32') ... image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) >>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
for i in range(len(dataset)): >>> for i in range(len(dataset)):
image1, label1, image2, label2 = dataset[i] ... image1, label1, image2, label2 = dataset[i]
print(image1) ... # do something
print(label1)
print(image2)
print(label2)
""" """
def __init__(self, datasets): def __init__(self, datasets):
...@@ -379,25 +401,26 @@ class ChainDataset(IterableDataset): ...@@ -379,25 +401,26 @@ class ChainDataset(IterableDataset):
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
import paddle >>> import paddle
from paddle.io import IterableDataset, ChainDataset >>> from paddle.io import IterableDataset, ChainDataset
# define a random dataset >>> # define a random dataset
class RandomDataset(IterableDataset): >>> class RandomDataset(IterableDataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __iter__(self): ... def __iter__(self):
for i in range(10): ... for i in range(10):
image = np.random.random([32]).astype('float32') ... image = np.random.random([32]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
yield image, label ... yield image, label
...
dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) >>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
for image, label in iter(dataset): >>> for image, label in iter(dataset):
print(image, label) ... # do something
... ...
""" """
...@@ -430,18 +453,18 @@ class Subset(Dataset): ...@@ -430,18 +453,18 @@ class Subset(Dataset):
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
from paddle.io import Subset >>> from paddle.io import Subset
# example 1: >>> # example 1:
a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2]) >>> a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
print(list(a)) >>> print(list(a))
# [1, 3] [1, 3]
# example 2: >>> # example 2:
b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1]) >>> b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
print(list(b)) >>> print(list(b))
# [2, 2] [2, 2]
""" """
def __init__(self, dataset, indices): def __init__(self, dataset, indices):
...@@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None): ...@@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None):
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
from paddle.io import random_split
>>> paddle.seed(2023)
a_list = paddle.io.random_split(range(10), [3, 7]) >>> a_list = paddle.io.random_split(range(10), [3, 7])
print(len(a_list)) >>> print(len(a_list))
# 2 2
for idx, v in enumerate(a_list[0]): >>> # output of the first subset
print(idx, v) >>> for idx, v in enumerate(a_list[0]):
... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
# output of the first subset 0 7
# 0 1 1 6
# 1 3 2 5
# 2 9
>>> # output of the second subset
for idx, v in enumerate(a_list[1]): >>> for idx, v in enumerate(a_list[1]):
print(idx, v) ... print(idx, v) # doctest: +SKIP("The output depends on the environment.")
# output of the second subset 0 1
# 0 5 1 9
# 1 7 2 4
# 2 8 3 2
# 3 6 4 0
# 4 0 5 3
# 5 2 6 8
# 6 4
""" """
# Cannot verify that dataset is Sized # Cannot verify that dataset is Sized
if sum(lengths) != len(dataset): # type: ignore if sum(lengths) != len(dataset): # type: ignore
...@@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y): ...@@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y):
.. code-block:: python .. code-block:: python
_accumulate([1,2,3,4,5]) --> 1 3 6 10 15 >>> list(_accumulate([1, 2, 3, 4, 5]))
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120 [1, 3, 6, 10, 15]
>>> import operator
>>> list(_accumulate([1, 2, 3, 4, 5], operator.mul))
[1, 2, 6, 24, 120]
""" """
it = iter(iterable) it = iter(iterable)
......
...@@ -44,34 +44,39 @@ class Sampler: ...@@ -44,34 +44,39 @@ class Sampler:
.. code-block:: python .. code-block:: python
from paddle.io import Dataset, Sampler >>> from paddle.io import Dataset, Sampler
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32') ... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
class MySampler(Sampler): >>> class MySampler(Sampler):
def __init__(self, data_source): ... def __init__(self, data_source):
self.data_source = data_source ... self.data_source = data_source
...
def __iter__(self): ... def __iter__(self):
return iter(range(len(self.data_source))) ... return iter(range(len(self.data_source)))
...
def __len__(self): ... def __len__(self):
return len(self.data_source) ... return len(self.data_source)
...
sampler = MySampler(data_source=RandomDataset(100)) >>> sampler = MySampler(data_source=RandomDataset(100))
for index in sampler: >>> for index in sampler:
print(index) ... print(index)
0
1
2
...
99
see `paddle.io.BatchSampler` see `paddle.io.BatchSampler`
see `paddle.io.DataLoader` see `paddle.io.DataLoader`
...@@ -105,24 +110,29 @@ class SequenceSampler(Sampler): ...@@ -105,24 +110,29 @@ class SequenceSampler(Sampler):
.. code-block:: python .. code-block:: python
from paddle.io import Dataset, SequenceSampler >>> from paddle.io import Dataset, SequenceSampler
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([784]).astype('float32') ... image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64') ... label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
sampler = SequenceSampler(data_source=RandomDataset(100)) >>> sampler = SequenceSampler(data_source=RandomDataset(100))
for index in sampler: >>> for index in sampler:
print(index) ... print(index)
0
1
2
...
99
see `paddle.io.Sampler` see `paddle.io.Sampler`
""" """
...@@ -160,25 +170,31 @@ class RandomSampler(Sampler): ...@@ -160,25 +170,31 @@ class RandomSampler(Sampler):
.. code-block:: python .. code-block:: python
from paddle.io import Dataset, RandomSampler >>> import numpy as np
>>> from paddle.io import Dataset, RandomSampler
class RandomDataset(Dataset):
def __init__(self, num_samples): >>> np.random.seed(2023)
self.num_samples = num_samples >>> class RandomDataset(Dataset):
... def __init__(self, num_samples):
def __getitem__(self, idx): ... self.num_samples = num_samples
image = np.random.random([784]).astype('float32') ...
label = np.random.randint(0, 9, (1, )).astype('int64') ... def __getitem__(self, idx):
return image, label ... image = np.random.random([784]).astype('float32')
... label = np.random.randint(0, 9, (1, )).astype('int64')
def __len__(self): ... return image, label
return self.num_samples ...
... def __len__(self):
sampler = RandomSampler(data_source=RandomDataset(100)) ... return self.num_samples
...
for index in sampler: >>> sampler = RandomSampler(data_source=RandomDataset(100))
print(index)
>>> for index in sampler:
... print(index)
56
12
68
...
87
""" """
def __init__( def __init__(
...@@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler): ...@@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler):
.. code-block:: python .. code-block:: python
from paddle.io import WeightedRandomSampler >>> import numpy as np
>>> from paddle.io import WeightedRandomSampler
sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2],
num_samples=5, >>> np.random.seed(2023)
replacement=True) >>> sampler = WeightedRandomSampler(
... weights=[0.1, 0.3, 0.5, 0.7, 0.2],
for index in sampler: ... num_samples=5,
print(index) ... replacement=True
... )
>>> for index in sampler:
... print(index)
2
4
3
1
1
""" """
def __init__(self, weights, num_samples, replacement=True): def __init__(self, weights, num_samples, replacement=True):
......
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
# limitations under the License. # limitations under the License.
import os import os
# NOTE: queue has a different name in python2 and python3
import queue import queue
import sys import sys
import traceback import traceback
...@@ -94,51 +92,64 @@ def get_worker_info(): ...@@ -94,51 +92,64 @@ def get_worker_info():
Returns: Returns:
WorkerInfo: an instance of WorkerInfo which contains fields above. WorkerInfo: an instance of WorkerInfo which contains fields above.
.. note:: Notes:
For more usage and examples, please see :code:`paddle.io.IterableDataset` For more usage and examples, please see :code:`paddle.io.IterableDataset`
Example: Example:
.. code-block:: python .. code-block:: python
import math >>> import math
import paddle >>> import paddle
import numpy as np >>> import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info >>> from paddle.io import IterableDataset, DataLoader, get_worker_info
class SplitedIterableDataset(IterableDataset): >>> class SplitedIterableDataset(IterableDataset):
def __init__(self, start, end): ... def __init__(self, start, end):
self.start = start ... self.start = start
self.end = end ... self.end = end
...
def __iter__(self): ... def __iter__(self):
worker_info = get_worker_info() ... worker_info = get_worker_info()
if worker_info is None: ... if worker_info is None:
iter_start = self.start ... iter_start = self.start
iter_end = self.end ... iter_end = self.end
else: ... else:
per_worker = int( ... per_worker = int(
math.ceil((self.end - self.start) / float( ... math.ceil((self.end - self.start) / float(
worker_info.num_workers))) ... worker_info.num_workers)))
worker_id = worker_info.id ... worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker ... iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.end) ... iter_end = min(iter_start + per_worker, self.end)
...
for i in range(iter_start, iter_end): ... for i in range(iter_start, iter_end):
yield np.array([i]) ... yield np.array([i])
...
place = paddle.CPUPlace() >>> place = paddle.CPUPlace()
dataset = SplitedIterableDataset(start=2, end=9) >>> dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader( >>> dataloader = DataLoader(
dataset, ... dataset,
places=place, ... places=place,
num_workers=2, ... num_workers=2,
batch_size=1, ... batch_size=1,
drop_last=True) ... drop_last=True)
...
for data in dataloader: >>> for data in dataloader:
print(data) ... print(data) # doctest: +SKIP("The output depends on the environment.")
# outputs: [2, 5, 3, 6, 4, 7] Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[2]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[6]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[3]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[7]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[4]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[8]])
Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
[[5]])
""" """
return _worker_info return _worker_info
......
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
# limitations under the License. # limitations under the License.
import atexit import atexit
# NOTE: queue has a different name in python2 and python3
import queue import queue
import signal import signal
import sys import sys
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
import copy import copy
import multiprocessing import multiprocessing
# NOTE: queue has a different name in python2 and python3
import sys import sys
import time import time
import warnings import warnings
...@@ -234,7 +232,7 @@ class DataLoader: ...@@ -234,7 +232,7 @@ class DataLoader:
For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
.. note:: Notes:
GPU tensor operation is not supported in subprocess currently, GPU tensor operation is not supported in subprocess currently,
please don't use GPU tensor operations in pipeline which will please don't use GPU tensor operations in pipeline which will
be performed in subprocess, such as dataset transforms, collate_fn, be performed in subprocess, such as dataset transforms, collate_fn,
...@@ -250,7 +248,7 @@ class DataLoader: ...@@ -250,7 +248,7 @@ class DataLoader:
:attr:`collate_fn` or :attr:`default_collate_fn`. :attr:`collate_fn` or :attr:`default_collate_fn`.
.. note:: Notes:
When automatic batching is disabled, :attr:`default_collate_fn` will When automatic batching is disabled, :attr:`default_collate_fn` will
do nothing to data from dataset. do nothing to data from dataset.
...@@ -321,68 +319,66 @@ class DataLoader: ...@@ -321,68 +319,66 @@ class DataLoader:
.. code-block:: python .. code-block:: python
import numpy as np >>> import numpy as np
import paddle >>> import paddle
import paddle.nn as nn >>> import paddle.nn as nn
import paddle.nn.functional as F >>> import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader >>> from paddle.io import Dataset, BatchSampler, DataLoader
BATCH_NUM = 20 >>> BATCH_NUM = 20
BATCH_SIZE = 16 >>> BATCH_SIZE = 16
EPOCH_NUM = 4 >>> EPOCH_NUM = 4
IMAGE_SIZE = 784 >>> IMAGE_SIZE = 784
CLASS_NUM = 10 >>> CLASS_NUM = 10
# define a random dataset >>> # define a random dataset
class RandomDataset(Dataset): >>> class RandomDataset(Dataset):
def __init__(self, num_samples): ... def __init__(self, num_samples):
self.num_samples = num_samples ... self.num_samples = num_samples
...
def __getitem__(self, idx): ... def __getitem__(self, idx):
image = np.random.random([IMAGE_SIZE]).astype('float32') ... image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') ... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
return image, label ... return image, label
...
def __len__(self): ... def __len__(self):
return self.num_samples ... return self.num_samples
...
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
class SimpleNet(nn.Layer): >>> class SimpleNet(nn.Layer):
def __init__(self): ... def __init__(self):
super().__init__() ... super().__init__()
self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) ... self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)
...
def forward(self, image, label=None): ... def forward(self, image, label=None):
return self.fc(image) ... return self.fc(image)
...
simple_net = SimpleNet() >>> simple_net = SimpleNet()
opt = paddle.optimizer.SGD(learning_rate=1e-3, >>> opt = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=simple_net.parameters()) ... parameters=simple_net.parameters())
...
loader = DataLoader(dataset, >>> loader = DataLoader(dataset,
batch_size=BATCH_SIZE, ... batch_size=BATCH_SIZE,
shuffle=True, ... shuffle=True,
drop_last=True, ... drop_last=True,
num_workers=2) ... num_workers=2)
...
for e in range(EPOCH_NUM): >>> for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()): ... for i, (image, label) in enumerate(loader()):
out = simple_net(image) ... out = simple_net(image)
loss = F.cross_entropy(out, label) ... loss = F.cross_entropy(out, label)
avg_loss = paddle.mean(loss) ... avg_loss = paddle.mean(loss)
avg_loss.backward() ... avg_loss.backward()
opt.minimize(avg_loss) ... opt.minimize(avg_loss)
simple_net.clear_gradients() ... simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) ... print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
Notes:
.. note::
For reading iterable dataset with multiprocess DataLoader, For reading iterable dataset with multiprocess DataLoader,
please see :code:`paddle.io.IterableDataset` please see :code:`paddle.io.IterableDataset`
""" """
def __init__( def __init__(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册