diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py index 190e9240900f8b44e169aa8198aaf55f342aa36b..78c93151a390dccaa3218e5310ebd6e1da09a9ac 100644 --- a/python/paddle/io/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -58,40 +58,44 @@ class BatchSampler(Sampler): .. code-block:: python - from paddle.io import RandomSampler, BatchSampler, Dataset - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - bs = BatchSampler(dataset=RandomDataset(100), - shuffle=False, - batch_size=16, - drop_last=False) - - for batch_indices in bs: - print(batch_indices) - - # init with sampler - sampler = RandomSampler(RandomDataset(100)) - bs = BatchSampler(sampler=sampler, - batch_size=8, - drop_last=True) - - for batch_indices in bs: - print(batch_indices) - - - + >>> import numpy as np + >>> from paddle.io import RandomSampler, BatchSampler, Dataset + + >>> np.random.seed(2023) + >>> # init with dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> bs = BatchSampler(dataset=RandomDataset(100), + ... shuffle=False, + ... batch_size=16, + ... drop_last=False) + ... + >>> for batch_indices in bs: + ... print(batch_indices) + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ... 
+ [96, 97, 98, 99] + >>> # init with sampler + >>> sampler = RandomSampler(RandomDataset(100)) + >>> bs = BatchSampler(sampler=sampler, + ... batch_size=8, + ... drop_last=True) + ... + >>> for batch_indices in bs: + ... print(batch_indices) + [56, 12, 68, 0, 82, 66, 91, 44] + ... + [53, 17, 22, 86, 52, 3, 92, 33] """ def __init__( @@ -203,29 +207,29 @@ class DistributedBatchSampler(BatchSampler): Examples: .. code-block:: python - import numpy as np - - from paddle.io import Dataset, DistributedBatchSampler - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(100) - sampler = DistributedBatchSampler(dataset, batch_size=64) - - for data in sampler: - # do something - break + >>> import numpy as np + + >>> from paddle.io import Dataset, DistributedBatchSampler + + >>> # init with dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(100) + >>> sampler = DistributedBatchSampler(dataset, batch_size=64) + + >>> for data in sampler: + ... # do something + ... break """ def __init__( @@ -339,27 +343,27 @@ class DistributedBatchSampler(BatchSampler): Examples: .. 
code-block:: python - import numpy as np - - from paddle.io import Dataset, DistributedBatchSampler - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(100) - sampler = DistributedBatchSampler(dataset, batch_size=64) - - for epoch in range(10): - sampler.set_epoch(epoch) + >>> import numpy as np + + >>> from paddle.io import Dataset, DistributedBatchSampler + + >>> # init with dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(100) + >>> sampler = DistributedBatchSampler(dataset, batch_size=64) + + >>> for epoch in range(10): + ... sampler.set_epoch(epoch) """ self.epoch = epoch diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index 3e0458ae9b70091f651f2b5e425f736d00e64c32..4daf410a318362ec5410e455894ddb11e467bde1 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -37,26 +37,26 @@ class Dataset: .. 
code-block:: python - import numpy as np - from paddle.io import Dataset - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(10) - for i in range(len(dataset)): - print(dataset[i]) - + >>> import numpy as np + >>> from paddle.io import Dataset + + >>> # define a random dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(10) + >>> for i in range(len(dataset)): + ... image, label = dataset[i] + ... # do something """ def __init__(self): @@ -95,23 +95,24 @@ class IterableDataset(Dataset): .. code-block:: python :name: code-example1 - import numpy as np - from paddle.io import IterableDataset - - # define a random dataset - class RandomDataset(IterableDataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __iter__(self): - for i in range(self.num_samples): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - yield image, label - - dataset = RandomDataset(10) - for img, lbl in dataset: - print(img, lbl) + >>> import numpy as np + >>> from paddle.io import IterableDataset + + >>> # define a random dataset + >>> class RandomDataset(IterableDataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __iter__(self): + ... for i in range(self.num_samples): + ... 
image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... yield image, label + ... + >>> dataset = RandomDataset(10) + >>> for img, label in dataset: + ... # do something + ... ... When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and will yield whole dataset samples, which means samples in dataset will be repeated in @@ -125,87 +126,113 @@ class IterableDataset(Dataset): .. code-block:: python :name: code-example2 - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class SplitedIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - worker_info = get_worker_info() - if worker_info is None: - iter_start = self.start - iter_end = self.end - else: - per_worker = int( - math.ceil((self.end - self.start) / float( - worker_info.num_workers))) - worker_id = worker_info.id - iter_start = self.start + worker_id * per_worker - iter_end = min(iter_start + per_worker, self.end) - - for i in range(iter_start, iter_end): - yield np.array([i]) - - dataset = SplitedIterableDataset(start=2, end=9) - dataloader = DataLoader( - dataset, - num_workers=2, - batch_size=1, - drop_last=True) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] + >>> import math + >>> import paddle + >>> import numpy as np + >>> from paddle.io import IterableDataset, DataLoader, get_worker_info + + >>> class SplitedIterableDataset(IterableDataset): + ... def __init__(self, start, end): + ... self.start = start + ... self.end = end + ... + ... def __iter__(self): + ... worker_info = get_worker_info() + ... if worker_info is None: + ... iter_start = self.start + ... iter_end = self.end + ... else: + ... per_worker = int( + ... math.ceil((self.end - self.start) / float( + ... worker_info.num_workers))) + ... worker_id = worker_info.id + ... 
iter_start = self.start + worker_id * per_worker + ... iter_end = min(iter_start + per_worker, self.end) + ... + ... for i in range(iter_start, iter_end): + ... yield np.array([i]) + ... + >>> dataset = SplitedIterableDataset(start=2, end=9) + >>> dataloader = DataLoader( + ... dataset, + ... num_workers=2, + ... batch_size=1, + ... drop_last=True) + ... + >>> for data in dataloader: + ... print(data) # doctest: +SKIP("The output depends on the environment.") + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[6]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[8]]) splitting data copy in each worker by :code:`worker_init_fn` .. 
code-block:: python :name: code-example3 - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class RangeIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - for i in range(self.start, self.end): - yield np.array([i]) - - dataset = RangeIterableDataset(start=2, end=9) - - def worker_init_fn(worker_id): - worker_info = get_worker_info() - - dataset = worker_info.dataset - start = dataset.start - end = dataset.end - num_per_worker = int( - math.ceil((end - start) / float(worker_info.num_workers))) - - worker_id = worker_info.id - dataset.start = start + worker_id * num_per_worker - dataset.end = min(dataset.start + num_per_worker, end) - - dataloader = DataLoader( - dataset, - num_workers=2, - batch_size=1, - drop_last=True, - worker_init_fn=worker_init_fn) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] + >>> import math + >>> import paddle + >>> import numpy as np + >>> from paddle.io import IterableDataset, DataLoader, get_worker_info + + >>> class RangeIterableDataset(IterableDataset): + ... def __init__(self, start, end): + ... self.start = start + ... self.end = end + ... + ... def __iter__(self): + ... for i in range(self.start, self.end): + ... yield np.array([i]) + ... + >>> dataset = RangeIterableDataset(start=2, end=9) + + >>> def worker_init_fn(worker_id): + ... worker_info = get_worker_info() + ... + ... dataset = worker_info.dataset + ... start = dataset.start + ... end = dataset.end + ... num_per_worker = int( + ... math.ceil((end - start) / float(worker_info.num_workers))) + ... + ... worker_id = worker_info.id + ... dataset.start = start + worker_id * num_per_worker + ... dataset.end = min(dataset.start + num_per_worker, end) + ... + >>> dataloader = DataLoader( + ... dataset, + ... num_workers=2, + ... batch_size=1, + ... drop_last=True, + ... 
worker_init_fn=worker_init_fn) + ... + >>> for data in dataloader: + ... print(data) # doctest: +SKIP("The output depends on the environment.") + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[6]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[8]]) """ @@ -249,22 +276,21 @@ class TensorDataset(Dataset): .. code-block:: python - import numpy as np - import paddle - from paddle.io import TensorDataset + >>> import numpy as np + >>> import paddle + >>> from paddle.io import TensorDataset - input_np = np.random.random([2, 3, 4]).astype('float32') - input = paddle.to_tensor(input_np) - label_np = np.random.random([2, 1]).astype('int32') - label = paddle.to_tensor(label_np) + >>> input_np = np.random.random([2, 3, 4]).astype('float32') + >>> input = paddle.to_tensor(input_np) + >>> label_np = np.random.random([2, 1]).astype('int32') + >>> label = paddle.to_tensor(label_np) - dataset = TensorDataset([input, label]) - - for i in range(len(dataset)): - input, label = dataset[i] - print(input, label) + >>> dataset = TensorDataset([input, label]) + >>> for i in range(len(dataset)): + ... input, label = dataset[i] + ... # do something """ def __init__(self, tensors): @@ -309,32 +335,28 @@ class ComposeDataset(Dataset): .. 
code-block:: python - import numpy as np - import paddle - from paddle.io import Dataset, ComposeDataset - - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([32]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) - for i in range(len(dataset)): - image1, label1, image2, label2 = dataset[i] - print(image1) - print(label1) - print(image2) - print(label2) - + >>> import numpy as np + >>> import paddle + >>> from paddle.io import Dataset, ComposeDataset + + + >>> # define a random dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([32]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) + >>> for i in range(len(dataset)): + ... image1, label1, image2, label2 = dataset[i] + ... # do something """ def __init__(self, datasets): @@ -379,25 +401,26 @@ class ChainDataset(IterableDataset): .. 
code-block:: python - import numpy as np - import paddle - from paddle.io import IterableDataset, ChainDataset - - - # define a random dataset - class RandomDataset(IterableDataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __iter__(self): - for i in range(10): - image = np.random.random([32]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - yield image, label - - dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) - for image, label in iter(dataset): - print(image, label) + >>> import numpy as np + >>> import paddle + >>> from paddle.io import IterableDataset, ChainDataset + + + >>> # define a random dataset + >>> class RandomDataset(IterableDataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __iter__(self): + ... for i in range(10): + ... image = np.random.random([32]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... yield image, label + ... + >>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) + >>> for image, label in iter(dataset): + ... # do something + ... ... """ @@ -430,18 +453,18 @@ class Subset(Dataset): .. code-block:: python - import paddle - from paddle.io import Subset + >>> import paddle + >>> from paddle.io import Subset - # example 1: - a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2]) - print(list(a)) - # [1, 3] + >>> # example 1: + >>> a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2]) + >>> print(list(a)) + [1, 3] - # example 2: - b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1]) - print(list(b)) - # [2, 2] + >>> # example 2: + >>> b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1]) + >>> print(list(b)) + [2, 2] """ def __init__(self, dataset, indices): @@ -472,31 +495,30 @@ def random_split(dataset, lengths, generator=None): .. 
code-block:: python - import paddle - from paddle.io import random_split - - a_list = paddle.io.random_split(range(10), [3, 7]) - print(len(a_list)) - # 2 - - for idx, v in enumerate(a_list[0]): - print(idx, v) - - # output of the first subset - # 0 1 - # 1 3 - # 2 9 - - for idx, v in enumerate(a_list[1]): - print(idx, v) - # output of the second subset - # 0 5 - # 1 7 - # 2 8 - # 3 6 - # 4 0 - # 5 2 - # 6 4 + >>> import paddle + + >>> paddle.seed(2023) + >>> a_list = paddle.io.random_split(range(10), [3, 7]) + >>> print(len(a_list)) + 2 + + >>> # output of the first subset + >>> for idx, v in enumerate(a_list[0]): + ... print(idx, v) # doctest: +SKIP("The output depends on the environment.") + 0 7 + 1 6 + 2 5 + + >>> # output of the second subset + >>> for idx, v in enumerate(a_list[1]): + ... print(idx, v) # doctest: +SKIP("The output depends on the environment.") + 0 1 + 1 9 + 2 4 + 3 2 + 4 0 + 5 3 + 6 8 """ # Cannot verify that dataset is Sized if sum(lengths) != len(dataset): # type: ignore @@ -528,8 +550,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y): .. code-block:: python - _accumulate([1,2,3,4,5]) --> 1 3 6 10 15 - _accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120 + >>> list(_accumulate([1, 2, 3, 4, 5])) + [1, 3, 6, 10, 15] + + >>> import operator + >>> list(_accumulate([1, 2, 3, 4, 5], operator.mul)) + [1, 2, 6, 24, 120] """ it = iter(iterable) diff --git a/python/paddle/io/dataloader/sampler.py b/python/paddle/io/dataloader/sampler.py index aa8a4e649c76cd6bb90c4ce41910c43c5b29a6a4..d26316ecc0eb7ac26a13ed1ed4f387a05b65c2f3 100644 --- a/python/paddle/io/dataloader/sampler.py +++ b/python/paddle/io/dataloader/sampler.py @@ -44,34 +44,39 @@ class Sampler: .. 
code-block:: python - from paddle.io import Dataset, Sampler - - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - class MySampler(Sampler): - def __init__(self, data_source): - self.data_source = data_source - - def __iter__(self): - return iter(range(len(self.data_source))) - - def __len__(self): - return len(self.data_source) - - sampler = MySampler(data_source=RandomDataset(100)) - - for index in sampler: - print(index) + >>> import numpy as np + >>> from paddle.io import Dataset, Sampler + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> class MySampler(Sampler): + ... def __init__(self, data_source): + ... self.data_source = data_source + ... + ... def __iter__(self): + ... return iter(range(len(self.data_source))) + ... + ... def __len__(self): + ... return len(self.data_source) + ... + >>> sampler = MySampler(data_source=RandomDataset(100)) + + >>> for index in sampler: + ... print(index) + 0 + 1 + 2 + ... + 99 see `paddle.io.BatchSampler` see `paddle.io.DataLoader` @@ -105,24 +110,29 @@ class SequenceSampler(Sampler): ..
code-block:: python - from paddle.io import Dataset, SequenceSampler - - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - sampler = SequenceSampler(data_source=RandomDataset(100)) - - for index in sampler: - print(index) + >>> import numpy as np + >>> from paddle.io import Dataset, SequenceSampler + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> sampler = SequenceSampler(data_source=RandomDataset(100)) + + >>> for index in sampler: + ... print(index) + 0 + 1 + 2 + ... + 99 see `paddle.io.Sampler` """ @@ -160,25 +170,31 @@ class RandomSampler(Sampler): .. code-block:: python - from paddle.io import Dataset, RandomSampler - - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - sampler = RandomSampler(data_source=RandomDataset(100)) - - for index in sampler: - print(index) - + >>> import numpy as np + >>> from paddle.io import Dataset, RandomSampler + + >>> np.random.seed(2023) + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ...
return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> sampler = RandomSampler(data_source=RandomDataset(100)) + + >>> for index in sampler: + ... print(index) + 56 + 12 + 68 + ... + 87 """ def __init__( @@ -288,14 +304,22 @@ class WeightedRandomSampler(Sampler): .. code-block:: python - from paddle.io import WeightedRandomSampler - - sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2], - num_samples=5, - replacement=True) - - for index in sampler: - print(index) + >>> import numpy as np + >>> from paddle.io import WeightedRandomSampler + + >>> np.random.seed(2023) + >>> sampler = WeightedRandomSampler( + ... weights=[0.1, 0.3, 0.5, 0.7, 0.2], + ... num_samples=5, + ... replacement=True + ... ) + >>> for index in sampler: + ... print(index) + 2 + 4 + 3 + 1 + 1 """ def __init__(self, weights, num_samples, replacement=True): diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index 5eeeb849fc0251203b69a4597ca704c86e502670..4a1667483da6494fa0e0b6b0e93625d367da4e54 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -13,8 +13,6 @@ # limitations under the License. import os - -# NOTE: queue has a different name in python2 and python3 import queue import sys import traceback @@ -94,51 +92,64 @@ def get_worker_info(): Returns: WorkerInfo: an instance of WorkerInfo which contains fields above. - .. note:: + Notes: For more usage and examples, please see :code:`paddle.io.IterableDataset` Example: .. 
code-block:: python - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class SplitedIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - worker_info = get_worker_info() - if worker_info is None: - iter_start = self.start - iter_end = self.end - else: - per_worker = int( - math.ceil((self.end - self.start) / float( - worker_info.num_workers))) - worker_id = worker_info.id - iter_start = self.start + worker_id * per_worker - iter_end = min(iter_start + per_worker, self.end) - - for i in range(iter_start, iter_end): - yield np.array([i]) - - place = paddle.CPUPlace() - dataset = SplitedIterableDataset(start=2, end=9) - dataloader = DataLoader( - dataset, - places=place, - num_workers=2, - batch_size=1, - drop_last=True) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] + >>> import math + >>> import paddle + >>> import numpy as np + >>> from paddle.io import IterableDataset, DataLoader, get_worker_info + + >>> class SplitedIterableDataset(IterableDataset): + ... def __init__(self, start, end): + ... self.start = start + ... self.end = end + ... + ... def __iter__(self): + ... worker_info = get_worker_info() + ... if worker_info is None: + ... iter_start = self.start + ... iter_end = self.end + ... else: + ... per_worker = int( + ... math.ceil((self.end - self.start) / float( + ... worker_info.num_workers))) + ... worker_id = worker_info.id + ... iter_start = self.start + worker_id * per_worker + ... iter_end = min(iter_start + per_worker, self.end) + ... + ... for i in range(iter_start, iter_end): + ... yield np.array([i]) + ... + >>> place = paddle.CPUPlace() + >>> dataset = SplitedIterableDataset(start=2, end=9) + >>> dataloader = DataLoader( + ... dataset, + ... places=place, + ... num_workers=2, + ... batch_size=1, + ... drop_last=True) + ... + >>> for data in dataloader: + ... 
print(data) # doctest: +SKIP("The output depends on the environment.") + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[6]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[8]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) """ return _worker_info diff --git a/python/paddle/io/multiprocess_utils.py b/python/paddle/io/multiprocess_utils.py index 51b0c2b81821427a8fa879017097c50563c19c42..c57b6dae86b5ed9f1663325e1cad01edb5e70b3f 100644 --- a/python/paddle/io/multiprocess_utils.py +++ b/python/paddle/io/multiprocess_utils.py @@ -13,8 +13,6 @@ # limitations under the License. import atexit - -# NOTE: queue has a different name in python2 and python3 import queue import signal import sys diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py index 861d1253dcfb973f3a717d15ff5456314ea8f2f0..d8db6cc2ab012e9b92ee2b599b94b873123b1e98 100644 --- a/python/paddle/io/reader.py +++ b/python/paddle/io/reader.py @@ -14,8 +14,6 @@ import copy import multiprocessing - -# NOTE: queue has a different name in python2 and python3 import sys import time import warnings @@ -234,7 +232,7 @@ class DataLoader: For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` - .. note:: + Notes: GPU tensor operation is not supported in subprocess currently, please don't use GPU tensor operations in pipeline which will be performed in subprocess, such as dataset transforms, collte_fn, @@ -250,7 +248,7 @@ class DataLoader: :attr:`collate_fn` or :attr:`default_collate_fn`. - .. 
note:: + Notes: When automatic batching is disabled, :attr:`default_collate_fn` will do nothing to data from dataset. @@ -321,68 +319,66 @@ class DataLoader: .. code-block:: python - import numpy as np - - import paddle - import paddle.nn as nn - import paddle.nn.functional as F - from paddle.io import Dataset, BatchSampler, DataLoader - - BATCH_NUM = 20 - BATCH_SIZE = 16 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 10 - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - - class SimpleNet(nn.Layer): - def __init__(self): - super().__init__() - self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) - - def forward(self, image, label=None): - return self.fc(image) - - simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = F.cross_entropy(out, label) - avg_loss = paddle.mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - - .. note:: + >>> import numpy as np + + >>> import paddle + >>> import paddle.nn as nn + >>> import paddle.nn.functional as F + >>> from paddle.io import Dataset, BatchSampler, DataLoader + + >>> BATCH_NUM = 20 + >>> BATCH_SIZE = 16 + >>> EPOCH_NUM = 4 + + >>> IMAGE_SIZE = 784 + >>> CLASS_NUM = 10 + + >>> # define a random dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... 
self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([IMAGE_SIZE]).astype('float32') + ... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + + >>> class SimpleNet(nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) + ... + ... def forward(self, image, label=None): + ... return self.fc(image) + ... + >>> simple_net = SimpleNet() + >>> opt = paddle.optimizer.SGD(learning_rate=1e-3, + ... parameters=simple_net.parameters()) + ... + >>> loader = DataLoader(dataset, + ... batch_size=BATCH_SIZE, + ... shuffle=True, + ... drop_last=True, + ... num_workers=2) + ... + >>> for e in range(EPOCH_NUM): + ... for i, (image, label) in enumerate(loader()): + ... out = simple_net(image) + ... loss = F.cross_entropy(out, label) + ... avg_loss = paddle.mean(loss) + ... avg_loss.backward() + ... opt.minimize(avg_loss) + ... simple_net.clear_gradients() + ... print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) + + Notes: For reading iterable dataset with multiprocess Dataloader, please see :code:`paddle.io.IterableDataset` - """ def __init__(