From 50d3117d30d3b495e21c062c542c715d6dea4398 Mon Sep 17 00:00:00 2001
From: joejiong
Date: Wed, 9 Dec 2020 20:31:06 +0800
Subject: [PATCH] Add random_split and Subset dataset (#29291)

As the title
---
 python/paddle/fluid/dataloader/dataset.py   | 130 +++++++++++++++++-
 .../test_multiprocess_dataloader_dataset.py | 127 ++++++++++++++---
 python/paddle/io/__init__.py                |   4 +-
 3 files changed, 242 insertions(+), 19 deletions(-)

diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py
index 7ae77fe501b..ac90cbafe17 100644
--- a/python/paddle/fluid/dataloader/dataset.py
+++ b/python/paddle/fluid/dataloader/dataset.py
@@ -19,7 +19,7 @@ import paddle.dataset.common
 
 __all__ = [
     "Dataset", "IterableDataset", "TensorDataset", "ComposeDataset",
-    "ChainDataset"
+    "ChainDataset", "random_split", "Subset"
 ]
 
 
@@ -400,3 +400,131 @@ class ChainDataset(IterableDataset):
         for dataset in self.datasets:
             for sample in dataset:
                 yield sample
+
+
+class Subset(Dataset):
+    """
+    Subset of a dataset at specified indices.
+
+    Args:
+        dataset (Dataset): The whole Dataset.
+        indices (sequence): Indices in the whole set selected for subset.
+
+    Returns:
+        Dataset: A Dataset which is the subset of the original dataset.
+
+    Example code:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.io import Subset
+
+            # example 1:
+            a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
+            print(list(a))
+            # [1, 3]
+
+            # example 2:
+            b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
+            print(list(b))
+            # [2, 2]
+    """
+
+    def __init__(self, dataset, indices):
+        self.dataset = dataset
+        self.indices = indices
+
+    def __getitem__(self, idx):
+        return self.dataset[self.indices[idx]]
+
+    def __len__(self):
+        return len(self.indices)
+
+
+def random_split(dataset, lengths, generator=None):
+    """
+    Randomly split a dataset into non-overlapping new datasets of given lengths.
+    Raises ValueError when ``sum(lengths)`` does not equal ``len(dataset)``.
+
+    Args:
+        dataset (Dataset): Dataset to be split
+        lengths (sequence): lengths of splits to be produced
+        generator (Generator, optional): Reserved for a custom random generator; it is currently ignored and ``paddle.randperm`` always draws the permutation. Default is None.
+
+    Returns:
+        Datasets: A list of subset Datasets, which are the non-overlapping subsets of the original Dataset.
+
+    Example code:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.io import random_split
+
+            a_list = paddle.io.random_split(range(10), [3, 7])
+            print(len(a_list))
+            # 2
+
+            for idx, v in enumerate(a_list[0]):
+                print(idx, v)
+
+            # possible output of the first subset (the split is random)
+            # 0 1
+            # 1 3
+            # 2 9
+
+            for idx, v in enumerate(a_list[1]):
+                print(idx, v)
+            # possible output of the second subset (the split is random)
+            # 0 5
+            # 1 7
+            # 2 8
+            # 3 6
+            # 4 0
+            # 5 2
+            # 6 4
+    """
+    # Cannot verify that dataset is Sized
+    if sum(lengths) != len(dataset):  # type: ignore
+        raise ValueError(
+            "Sum of input lengths does not equal the length of the input dataset!"
+        )
+    # TODO(@Joejiong): support Variable or Tensor type with .tolist class member function.
+    # For example var.item() and var.tolist()
+    indices = paddle.randperm(sum(lengths)).numpy().tolist()
+    return [
+        Subset(dataset, indices[offset - length:offset])
+        for offset, length in zip(_accumulate(lengths), lengths)
+    ]
+
+
+def _accumulate(iterable, fn=lambda x, y: x + y):
+    """
+    Return running totals
+
+    Args:
+        iterable: any iterable object, for example a list of split lengths.
+        fn (callable): combines the running total with the next element,
+            called as ``fn(total, element)``. Defaults to addition.
+
+    Yields:
+        the running total after each element; nothing for an empty iterable.
+
+    Example code:
+
+        .. code-block:: python
+
+            _accumulate([1,2,3,4,5]) --> 1 3 6 10 15
+            _accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
+    """
+
+    it = iter(iterable)
+    try:
+        total = next(it)
+    except StopIteration:
+        return
+    yield total
+    for element in it:
+        total = fn(total, element)
+        yield total
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
index 4ff9b73421a..0f7b0ace67a 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
@@ -20,8 +20,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.io import Dataset, IterableDataset, TensorDataset, \
-        ComposeDataset, ChainDataset, DataLoader
-from paddle.fluid.dygraph.base import to_variable
+        ComposeDataset, ChainDataset, DataLoader, random_split, Subset
 
 IMAGE_SIZE = 32
 
@@ -54,14 +53,14 @@ class RandomIterableDataset(IterableDataset):
 
 class TestTensorDataset(unittest.TestCase):
     def run_main(self, num_workers, places):
-        fluid.default_startup_program().random_seed = 1
-        fluid.default_main_program().random_seed = 1
-        place = fluid.CPUPlace()
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
+        place = paddle.CPUPlace()
         with fluid.dygraph.guard(place):
             input_np = np.random.random([16, 3, 4]).astype('float32')
-            input = to_variable(input_np)
+            input = paddle.to_tensor(input_np)
             label_np = np.random.random([16, 1]).astype('int32')
-            label = to_variable(label_np)
+            label = paddle.to_tensor(label_np)
 
             dataset = TensorDataset([input, label])
             assert len(dataset) == 16
@@ -83,17 +82,17 @@ class TestTensorDataset(unittest.TestCase):
                 assert np.allclose(label.numpy(), label_np[i])
 
     def test_main(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
+        places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
         for p in places:
             self.run_main(num_workers=0, places=p)
 
 
 class TestComposeDataset(unittest.TestCase):
     def test_main(self):
-        fluid.default_startup_program().random_seed = 1
-        fluid.default_main_program().random_seed = 1
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
 
         dataset1 = RandomDataset(10)
         dataset2 = RandomDataset(10)
@@ -110,10 +109,104 @@ class TestComposeDataset(unittest.TestCase):
             assert np.allclose(label2, label2_t)
 
 
+class TestRandomSplitApi(unittest.TestCase):
+    def test_main(self):
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
+
+        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])
+
+        self.assertEqual(len(dataset1), 1)
+        self.assertEqual(len(dataset2), 4)
+
+        elements_list = list(range(5))
+
+        for val in dataset1:
+            elements_list.remove(val)
+
+        for val in dataset2:
+            elements_list.remove(val)
+
+        self.assertEqual(len(elements_list), 0)
+
+
+class TestRandomSplitError(unittest.TestCase):
+    def test_errors(self):
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
+
+        self.assertRaises(ValueError, paddle.io.random_split, range(5), [3, 8])
+        self.assertRaises(ValueError, paddle.io.random_split, range(5), [8])
+        self.assertRaises(ValueError, paddle.io.random_split, range(5), [])
+
+
+class TestSubsetDataset(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
+
+        input_np = np.random.random([5, 3, 4]).astype('float32')
+        input = paddle.to_tensor(input_np)
+        label_np = np.random.random([5, 1]).astype('int32')
+        label = paddle.to_tensor(label_np)
+
+        dataset = TensorDataset([input, label])
+        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
+        odd_subset = paddle.io.Subset(dataset, [1, 3])
+
+        assert len(dataset) == 5
+
+        def prepare_dataloader(dataset):
+            return DataLoader(
+                dataset,
+                places=places,
+                num_workers=num_workers,
+                batch_size=1,
+                drop_last=True)
+
+        dataloader = prepare_dataloader(dataset)
+        dataloader_even = prepare_dataloader(even_subset)
+        dataloader_odd = prepare_dataloader(odd_subset)
+
+        def assert_basic(input, label):
+            assert len(input) == 1
+            assert len(label) == 1
+            assert input.shape == [1, 3, 4]
+            assert label.shape == [1, 1]
+            assert isinstance(input, paddle.Tensor)
+            assert isinstance(label, paddle.Tensor)
+
+        elements_list = list()
+        for _, (input, label) in enumerate(dataloader()):
+            assert_basic(input, label)
+            elements_list.append(label)
+
+        for _, (input, label) in enumerate(dataloader_even()):
+            assert_basic(input, label)
+            elements_list.remove(label)
+
+        odd_list = list()
+        for _, (input, label) in enumerate(dataloader_odd()):
+            assert_basic(input, label)
+            odd_list.append(label)
+
+        self.assertEqual(odd_list, elements_list)
+
+    def test_main(self):
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
+
+        places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
+        for p in places:
+            self.run_main(num_workers=0, places=p)
+
+
 class TestChainDataset(unittest.TestCase):
     def run_main(self, num_workers, places):
-        fluid.default_startup_program().random_seed = 1
-        fluid.default_main_program().random_seed = 1
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
 
         dataset1 = RandomIterableDataset(10)
         dataset2 = RandomIterableDataset(10)
@@ -135,9 +228,9 @@ class TestChainDataset(unittest.TestCase):
             idx += 1
 
     def test_main(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
+        places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
         for p in places:
             self.run_main(num_workers=0, places=p)
 
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index e8b07528019..59e2729941e 100644
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -28,9 +28,11 @@ __all__ = [
     'SequenceSampler',
     'RandomSampler',
     'WeightedRandomSampler',
+    'random_split',
+    'Subset'
 ]
 
 from ..fluid.io import DataLoader
 from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
     TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler, \
-    ComposeDataset, ChainDataset, WeightedRandomSampler
+    ComposeDataset, ChainDataset, WeightedRandomSampler, Subset, random_split
-- 
GitLab