# test_multiprocess_dataloader_dataset.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.io import (
    ChainDataset,
    ComposeDataset,
    DataLoader,
    Dataset,
    IterableDataset,
    TensorDataset,
)
# Length of the 1-D float32 "image" vectors generated by the random datasets
# in this module.
IMAGE_SIZE = 32


class RandomDataset(Dataset):
    """Map-style dataset whose samples are derived from the sample index.

    Each sample is an ``(image, label)`` pair of NumPy arrays: a float32
    vector of length ``IMAGE_SIZE`` and a one-element int64 label.
    """

    def __init__(self, sample_num):
        """Store the number of samples reported by ``__len__``."""
        self.sample_num = sample_num

    def __len__(self):
        """Return the configured number of samples."""
        return self.sample_num

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair for index ``idx``.

        The global NumPy RNG is re-seeded with ``idx`` before drawing, so
        the same index always produces the same sample.
        """
        np.random.seed(idx)
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, 9, (1,)).astype('int64')
        return image, label


class RandomIterableDataset(IterableDataset):
    """Iterable-style dataset yielding ``sample_num`` seeded random samples.

    Each sample is an ``(image, label)`` pair of NumPy arrays: a float32
    vector of length ``IMAGE_SIZE`` and a one-element int64 label.
    """

    def __init__(self, sample_num):
        """Store how many samples a single pass over the dataset yields."""
        self.sample_num = sample_num

    def __iter__(self):
        """Yield ``(image, label)`` pairs for positions ``0 .. sample_num - 1``.

        The global NumPy RNG is re-seeded with the position before each
        draw, so every pass yields the same sequence.
        """
        for i in range(self.sample_num):
            np.random.seed(i)
            image = np.random.random([IMAGE_SIZE]).astype('float32')
            label = np.random.randint(0, 9, (1,)).astype('int64')
            yield image, label

class TestTensorDataset(unittest.TestCase):
    """Round-trip a ``TensorDataset`` through ``DataLoader`` and compare values."""

    def run_main(self, num_workers, places):
        """Load 16 samples one per batch and compare them with their sources.

        Args:
            num_workers: Forwarded to ``DataLoader``.
            places: Place supplied by ``test_main``.
                NOTE(review): this argument is not used; both the dygraph
                guard and the loader run on ``paddle.CPUPlace()`` --
                confirm whether that is intended.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            input_np = np.random.random([16, 3, 4]).astype('float32')
            input_tensor = paddle.to_tensor(input_np)
            # Draw integer labels directly. Casting np.random.random()
            # output (floats in [0, 1)) to int32 would produce all zeros
            # and make the label comparison below vacuous.
            label_np = np.random.randint(0, 10, [16, 1]).astype('int32')
            label_tensor = paddle.to_tensor(label_np)

            dataset = TensorDataset([input_tensor, label_tensor])
            self.assertEqual(len(dataset), 16)
            dataloader = DataLoader(
                dataset,
                places=place,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True,
            )

            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            for i, (batch_input, batch_label) in enumerate(dataloader()):
                self.assertEqual(len(batch_input), 1)
                self.assertEqual(len(batch_label), 1)
                self.assertEqual(batch_input.shape, [1, 3, 4])
                self.assertEqual(batch_label.shape, [1, 1])
                self.assertIsInstance(batch_input, tensor_types)
                self.assertIsInstance(batch_label, tensor_types)
                # batch_size=1 without shuffling: batch i is sample i.
                self.assertTrue(np.allclose(batch_input.numpy(), input_np[i]))
                self.assertTrue(np.allclose(batch_label.numpy(), label_np[i]))

    def test_main(self):
        """Call ``run_main`` once per place (CPU, plus CUDA 0 if compiled in)."""
        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)


class TestComposeDataset(unittest.TestCase):
    """Check that ``ComposeDataset`` concatenates the fields of its members."""

    def test_main(self):
        """Compose two 10-sample datasets and compare every composed sample."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1 = RandomDataset(10)
        dataset2 = RandomDataset(10)
        dataset = ComposeDataset([dataset1, dataset2])
        self.assertEqual(len(dataset), 10)

        for i in range(len(dataset)):
            input1, label1, input2, label2 = dataset[i]
            # Fetch the reference fields in the same order as the original
            # test: RandomDataset re-seeds the global RNG on every access.
            input1_t, label1_t = dataset1[i]
            input2_t, label2_t = dataset2[i]
            self.assertTrue(np.allclose(input1, input1_t))
            self.assertTrue(np.allclose(label1, label1_t))
            self.assertTrue(np.allclose(input2, input2_t))
            self.assertTrue(np.allclose(label2, label2_t))


class TestRandomSplitApi(unittest.TestCase):
    """Check that ``paddle.io.random_split`` partitions its input."""

    def test_main(self):
        """Split ``range(5)`` into sizes 1 and 4 and verify the partition."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])

        self.assertEqual(len(dataset1), 1)
        self.assertEqual(len(dataset2), 4)

        # Every value must appear in exactly one split: list.remove raises
        # ValueError on a duplicate or unknown value, and anything left
        # over at the end was never produced.
        elements_list = list(range(5))

        for val in dataset1:
            elements_list.remove(val)

        for val in dataset2:
            elements_list.remove(val)

        self.assertEqual(elements_list, [])


class TestRandomSplitError(unittest.TestCase):
    """Check that ``paddle.io.random_split`` rejects invalid length lists."""

    def test_errors(self):
        """Expect ``ValueError`` for each invalid ``lengths`` on ``range(5)``."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        # A subTest per case keeps the remaining cases running after one fails.
        for lengths in ([3, 8], [8], []):
            with self.subTest(lengths=lengths):
                self.assertRaises(
                    ValueError, paddle.io.random_split, range(5), lengths
                )


class TestSubsetDataset(unittest.TestCase):
    """Check that ``paddle.io.Subset`` selects exactly the requested samples."""

    def run_main(self, num_workers, places):
        """Compare even- and odd-index subsets against the full dataset.

        Args:
            num_workers: Forwarded to every ``DataLoader``.
            places: Forwarded to every ``DataLoader``.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        input_np = np.random.random([5, 3, 4]).astype('float32')
        input_tensor = paddle.to_tensor(input_np)
        # Use distinct labels. Casting np.random.random() output (floats
        # in [0, 1)) to int32 would produce all zeros, and the
        # remove/compare bookkeeping below could not tell samples apart.
        label_np = np.arange(5).reshape([5, 1]).astype('int32')
        label_tensor = paddle.to_tensor(label_np)

        dataset = TensorDataset([input_tensor, label_tensor])
        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
        odd_subset = paddle.io.Subset(dataset, [1, 3])

        self.assertEqual(len(dataset), 5)

        def prepare_dataloader(dataset):
            """Build a one-sample-per-batch loader over ``dataset``."""
            return DataLoader(
                dataset,
                places=places,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True,
            )

        dataloader = prepare_dataloader(dataset)
        dataloader_even = prepare_dataloader(even_subset)
        dataloader_odd = prepare_dataloader(odd_subset)

        def assert_basic(batch_input, batch_label):
            """Check batch length, shape and tensor type of one batch."""
            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            self.assertEqual(len(batch_input), 1)
            self.assertEqual(len(batch_label), 1)
            self.assertEqual(batch_input.shape, [1, 3, 4])
            self.assertEqual(batch_label.shape, [1, 1])
            self.assertIsInstance(batch_input, tensor_types)
            self.assertIsInstance(batch_label, tensor_types)

        # Collect every label, strike out the even-subset labels, and the
        # remainder must match what the odd subset yields.
        elements_list = []
        for batch_input, batch_label in dataloader():
            assert_basic(batch_input, batch_label)
            elements_list.append(batch_label)

        for batch_input, batch_label in dataloader_even():
            assert_basic(batch_input, batch_label)
            elements_list.remove(batch_label)

        odd_list = []
        for batch_input, batch_label in dataloader_odd():
            assert_basic(batch_input, batch_label)
            odd_list.append(batch_label)

        self.assertEqual(odd_list, elements_list)

    def test_main(self):
        """Call ``run_main`` once per place (CPU, plus CUDA 0 if compiled in)."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)


class TestChainDataset(unittest.TestCase):
    """Check that ``ChainDataset`` yields its members' samples back to back."""

    def run_main(self, num_workers, places):
        """Chain two 10-sample iterable datasets and compare sample by sample.

        Args:
            num_workers: Not used by this check.
            places: Not used by this check.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1 = RandomIterableDataset(10)
        dataset2 = RandomIterableDataset(10)
        dataset = ChainDataset([dataset1, dataset2])

        samples = [sample for sample in iter(dataset)]
        self.assertEqual(len(samples), 20)

        # Walk the members in chain order; idx runs over the chained output.
        idx = 0
        for member in (dataset1, dataset2):
            for image, label in iter(member):
                self.assertTrue(np.allclose(image, samples[idx][0]))
                self.assertTrue(np.allclose(label, samples[idx][1]))
                idx += 1

    def test_main(self):
        """Call ``run_main`` once per place (CPU, plus CUDA 0 if compiled in)."""
        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)


class NumpyMixTensorDataset(Dataset):
    """Map-style dataset whose samples mix a paddle tensor and a NumPy array."""

    def __init__(self, sample_num):
        """Store the number of samples reported by ``__len__``."""
        self.sample_num = sample_num

    def __len__(self):
        """Return the configured number of samples."""
        return self.sample_num

    def __getitem__(self, idx):
        """Return ``(image, label)`` for index ``idx``.

        ``image`` is a CPU paddle tensor built from a float32 vector of
        length ``IMAGE_SIZE``; ``label`` stays a one-element int64 NumPy
        array. The global NumPy RNG is re-seeded with ``idx`` first.
        """
        np.random.seed(idx)
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, 9, (1,)).astype('int64')
        return paddle.to_tensor(image, place=paddle.CPUPlace()), label


class TestNumpyMixTensorDataset(TestTensorDataset):
    """Run ``NumpyMixTensorDataset`` through ``DataLoader``.

    Only ``run_main`` is overridden; ``test_main`` comes from
    ``TestTensorDataset``.
    """

    def run_main(self, num_workers, places):
        """Load 16 mixed samples one per batch and check shape and type.

        Args:
            num_workers: Forwarded to ``DataLoader``.
            places: Place supplied by the caller.
                NOTE(review): this argument is not used; the guard and the
                loader both run on ``paddle.CPUPlace()`` -- confirm
                whether that is intended.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            dataset = NumpyMixTensorDataset(16)
            self.assertEqual(len(dataset), 16)
            dataloader = DataLoader(
                dataset,
                places=place,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True,
            )

            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            for batch_input, batch_label in dataloader():
                self.assertEqual(len(batch_input), 1)
                self.assertEqual(len(batch_label), 1)
                self.assertEqual(batch_input.shape, [1, IMAGE_SIZE])
                self.assertEqual(batch_label.shape, [1, 1])
                self.assertIsInstance(batch_input, tensor_types)
                self.assertIsInstance(batch_label, tensor_types)


class ComplextDataset(Dataset):
    """Map-style dataset whose samples mix scalars, strings and containers."""

    def __init__(self, sample_num):
        """Store the number of samples reported by ``__len__``."""
        self.sample_num = sample_num

    def __len__(self):
        """Return the configured number of samples."""
        return self.sample_num

    def __getitem__(self, idx):
        """Return a five-field sample; ``idx`` does not affect the content.

        The fields are, in order: the float ``3.1``, the string ``'abc'``,
        a CPU paddle tensor built from a float32 vector of length
        ``IMAGE_SIZE``, a list ``[1, <float32 array of length 2>]`` and a
        dict with keys ``'a'`` (``2.0``) and ``'b'`` (float32 array of
        length 2).
        """
        return (
            3.1,
            'abc',
            paddle.to_tensor(
                np.random.random([IMAGE_SIZE]).astype('float32'),
                place=paddle.CPUPlace(),
            ),
            [1, np.random.random([2]).astype('float32')],
            {'a': 2.0, 'b': np.random.random([2]).astype('float32')},
        )


class TestComplextDataset(unittest.TestCase):
    """Check how ``DataLoader`` collates the mixed fields of ``ComplextDataset``."""

    def run_main(self, num_workers):
        """Load batches of two samples and check each collated field.

        Args:
            num_workers: Forwarded to ``DataLoader``.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            dataset = ComplextDataset(16)
            self.assertEqual(len(dataset), 16)
            dataloader = DataLoader(
                dataset,
                places=place,
                num_workers=num_workers,
                batch_size=2,
                drop_last=True,
            )

            for data in dataloader():
                self.assertEqual(len(data), 5)
                # data[0]: collate 3.1
                self.assertEqual(data[0].shape, [2])
                # data[1]: collate 'abc'
                self.assertIsInstance(data[1], list)
                self.assertEqual(len(data[1]), 2)
                self.assertIsInstance(data[1][0], str)
                self.assertIsInstance(data[1][1], str)
                # data[2]: collate tensor
                self.assertEqual(data[2].shape, [2, IMAGE_SIZE])
                # data[3]: collate list
                self.assertIsInstance(data[3], list)
                self.assertEqual(data[3][0].shape, [2])
                self.assertEqual(data[3][1].shape, [2, 2])
                # data[4]: collate dict
                self.assertIsInstance(data[4], dict)
                self.assertEqual(data[4]['a'].shape, [2])
                self.assertEqual(data[4]['b'].shape, [2, 2])

    def test_main(self):
        """Run the check with 0 and with 2 loader workers."""
        for num_workers in [0, 2]:
            self.run_main(num_workers)


class SingleFieldDataset(Dataset):
    """Map-style dataset in which every sample is one float32 array."""

    def __init__(self, sample_num):
        """Remember how many samples the dataset reports."""
        self.sample_num = sample_num

    def __len__(self):
        """Return the sample count given at construction."""
        return self.sample_num

    def __getitem__(self, idx):
        """Return a freshly drawn ``(2, 3)`` float32 array; ``idx`` is unused."""
        field = np.random.random((2, 3))
        return field.astype('float32')


class TestSingleFieldDataset(unittest.TestCase):
    """Check that single-field samples are collated into one tensor."""

    def init_dataset(self):
        """Create the 16-sample dataset under test; subclasses may override."""
        self.sample_num = 16
        self.dataset = SingleFieldDataset(self.sample_num)

    def run_main(self, num_workers):
        """Load batches of two samples and check the type and shape of each.

        Args:
            num_workers: Forwarded to ``DataLoader``.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            self.init_dataset()
            dataloader = DataLoader(
                self.dataset,
                places=place,
                num_workers=num_workers,
                batch_size=2,
                drop_last=True,
            )

            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            for data in dataloader():
                # A batch is the tensor itself, not a one-element container.
                self.assertIsInstance(data, tensor_types)
                self.assertEqual(data.shape, [2, 2, 3])

    def test_main(self):
        """Run the check with 0 and with 2 loader workers."""
        for num_workers in [0, 2]:
            self.run_main(num_workers)


class SingleFieldIterableDataset(IterableDataset):
    """Iterable-style dataset in which every sample is one float32 array."""

    def __init__(self, sample_num):
        """Remember how many samples one pass yields."""
        self.sample_num = sample_num

    def __iter__(self):
        """Yield ``sample_num`` freshly drawn ``(2, 3)`` float32 arrays."""
        for _position in range(self.sample_num):
            field = np.random.random((2, 3))
            yield field.astype('float32')


class TestSingleFieldIterableDataset(TestSingleFieldDataset):
    """Repeat the single-field checks with the iterable-style dataset."""

    def init_dataset(self):
        """Use a 16-sample ``SingleFieldIterableDataset`` as the dataset."""
        sample_count = 16
        self.sample_num = sample_count
        self.dataset = SingleFieldIterableDataset(self.sample_num)


class TestDataLoaderGenerateStates(unittest.TestCase):
    """Pin the output of the worker helper ``_generate_states``."""

    def setUp(self):
        """Define argument tuples and the expected state list for each."""
        # inputs[k] is unpacked as the positional arguments of
        # _generate_states; outputs[k] is the result expected for it.
        self.inputs = [(0, 1), (0, 2), (1, 3)]
        self.outputs = [
            [1835504127, 1731038949, 1320224556, 2330041505],
            [2834126987, 2358157858, 1860244682, 1437227251],
            [457190280, 2660306227, 859341110, 354512857],
        ]

    def test_main(self):
        """Compare ``_generate_states(*inp)`` with the recorded values."""
        from paddle.fluid.dataloader.worker import _generate_states

        for inp, outp in zip(self.inputs, self.outputs):
            out = _generate_states(*inp)
            self.assertEqual(out, outp)


class TestDatasetWithDropLast(unittest.TestCase):
    """Check the number of batches produced with and without ``drop_last``."""

    def run_main(self, dataset, num_samples, batch_size):
        """Count batches for every ``num_workers`` / ``drop_last`` combination.

        Args:
            dataset: Dataset handed to ``DataLoader``.
            num_samples: Number of samples ``dataset`` provides.
            batch_size: Batch size handed to ``DataLoader``.
        """
        for num_workers in [0, 1]:
            for drop_last in [True, False]:
                if drop_last:
                    # The trailing partial batch is discarded: floor division.
                    steps = num_samples // batch_size
                else:
                    # The trailing partial batch is kept: ceiling division.
                    steps = (num_samples + batch_size - 1) // batch_size
                dataloader = DataLoader(
                    dataset,
                    batch_size=batch_size,
                    drop_last=drop_last,
                    num_workers=num_workers,
                )
                datas = [data for data in dataloader]
                self.assertEqual(len(datas), steps)

    def test_map_dataset(self):
        """Run the check on a map-style dataset (10 samples, batch size 3)."""
        dataset = RandomDataset(10)
        self.run_main(dataset, 10, 3)

    def test_iterable_dataset(self):
        """Run the check on an iterable dataset (10 samples, batch size 3)."""
        dataset = RandomIterableDataset(10)
        self.run_main(dataset, 10, 3)


# Run the unittest runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()