# test_multiprocess_dataloader_dataset.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard
from paddle.io import (
    ChainDataset,
    ComposeDataset,
    DataLoader,
    Dataset,
    IterableDataset,
    TensorDataset,
)
30

31 32 33 34 35 36 37 38 39 40 41 42 43
# Length of each flat image vector generated by the synthetic datasets in
# this file.
IMAGE_SIZE = 32


class RandomDataset(Dataset):
    """Map-style dataset of reproducible pseudo-random samples.

    Every sample is an ``(image, label)`` pair of NumPy arrays: a float32
    vector of length ``IMAGE_SIZE`` and an int64 array of shape ``(1,)``
    holding a value drawn from ``[0, 9)``.

    Args:
        sample_num: Number of samples the dataset reports via ``len()``.
    """

    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __len__(self):
        return self.sample_num

    def __getitem__(self, idx):
        # Re-seeding the global NumPy RNG with the index makes each sample
        # depend only on ``idx``, not on the order of access.
        np.random.seed(idx)
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, 9, (1,)).astype('int64')
        return image, label


class RandomIterableDataset(IterableDataset):
    """Iterable-style dataset of reproducible pseudo-random samples.

    Iteration yields ``sample_num`` pairs ``(image, label)``: a float32
    vector of length ``IMAGE_SIZE`` and an int64 array of shape ``(1,)``
    holding a value drawn from ``[0, 9)``.

    Args:
        sample_num: Number of samples produced by one pass of ``__iter__``.
    """

    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __iter__(self):
        for i in range(self.sample_num):
            # Seed with the position so the i-th sample is always the same.
            np.random.seed(i)
            image = np.random.random([IMAGE_SIZE]).astype('float32')
            label = np.random.randint(0, 9, (1,)).astype('int64')
            yield image, label

59 60 61

class TestTensorDataset(unittest.TestCase):
    """Check that ``TensorDataset`` samples pass intact through ``DataLoader``."""

    def run_main(self, num_workers, places):
        """Load a 16-sample ``TensorDataset`` with batch size 1 and verify it.

        Args:
            num_workers: Number of worker processes given to ``DataLoader``.
            places: Place supplied by the caller.
                NOTE(review): this argument is never read; the body always
                uses ``paddle.CPUPlace()``. Confirm that this is intended.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            input_np = np.random.random([16, 3, 4]).astype('float32')
            input_tensor = paddle.to_tensor(input_np)
            # NOTE(review): values sampled from [0, 1) all truncate to 0 when
            # cast to int32, so the label check below only compares zeros.
            label_np = np.random.random([16, 1]).astype('int32')
            label_tensor = paddle.to_tensor(label_np)

            dataset = TensorDataset([input_tensor, label_tensor])
            self.assertEqual(len(dataset), 16)
            dataloader = DataLoader(
                dataset,
                places=place,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True,
            )

            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            for i, (batch_input, batch_label) in enumerate(dataloader()):
                self.assertEqual(len(batch_input), 1)
                self.assertEqual(len(batch_label), 1)
                self.assertEqual(batch_input.shape, [1, 3, 4])
                self.assertEqual(batch_label.shape, [1, 1])
                self.assertIsInstance(batch_input, tensor_types)
                self.assertIsInstance(batch_label, tensor_types)
                # With batch_size=1 the i-th batch holds exactly sample i.
                self.assertTrue(np.allclose(batch_input.numpy(), input_np[i]))
                self.assertTrue(np.allclose(batch_label.numpy(), label_np[i]))

    def func_test_main(self):
        """Call ``run_main`` for CPU and, when compiled with CUDA, GPU 0."""
        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

107 108

class TestComposeDataset(unittest.TestCase):
    """Check that ``ComposeDataset`` concatenates the fields of its parts."""

    def func_test_main(self):
        """Compare each composed sample with the samples of both sources."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1 = RandomDataset(10)
        dataset2 = RandomDataset(10)
        dataset = ComposeDataset([dataset1, dataset2])
        self.assertEqual(len(dataset), 10)

        for i in range(len(dataset)):
            # A composed sample must carry exactly four fields:
            # (image, label) of dataset1 followed by (image, label) of
            # dataset2.
            input1, label1, input2, label2 = dataset[i]
            input1_t, label1_t = dataset1[i]
            input2_t, label2_t = dataset2[i]
            pairs = (
                (input1, input1_t),
                (label1, label1_t),
                (input2, input2_t),
                (label2, label2_t),
            )
            for actual, expected in pairs:
                self.assertTrue(np.allclose(actual, expected))

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

132

133
class TestRandomSplitApi(unittest.TestCase):
    """Check that ``paddle.io.random_split`` partitions its input."""

    def func_test_main(self):
        """Split ``range(5)`` into parts of size 1 and 4 and check coverage."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])

        self.assertEqual(len(dataset1), 1)
        self.assertEqual(len(dataset2), 4)

        # Removing every split element from the full list must leave it
        # empty; ``list.remove`` raises if an element is missing or repeated.
        elements_list = list(range(5))
        for subset in (dataset1, dataset2):
            for val in subset:
                elements_list.remove(val)

        self.assertEqual(len(elements_list), 0)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

158 159

class TestRandomSplitError(unittest.TestCase):
    """Check that ``paddle.io.random_split`` rejects invalid lengths."""

    def func_test_errors(self):
        """Expect ``ValueError`` when lengths do not sum to the input size."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        # None of these length lists sums to len(range(5)) == 5.
        for lengths in ([3, 8], [8], []):
            with self.subTest(lengths=lengths):
                with self.assertRaises(ValueError):
                    paddle.io.random_split(range(5), lengths)

    def test_errors(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_errors()
        self.func_test_errors()

173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190

class TestSubsetDataset(unittest.TestCase):
    """Check ``paddle.io.Subset`` views of a ``TensorDataset``."""

    def run_main(self, num_workers, places):
        """Load the full dataset and two subsets, then compare their labels.

        Args:
            num_workers: Number of worker processes given to ``DataLoader``.
            places: Place forwarded to every ``DataLoader``.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        input_np = np.random.random([5, 3, 4]).astype('float32')
        input_tensor = paddle.to_tensor(input_np)
        # NOTE(review): values sampled from [0, 1) all truncate to 0 when
        # cast to int32, so every label is zero.
        label_np = np.random.random([5, 1]).astype('int32')
        label_tensor = paddle.to_tensor(label_np)

        dataset = TensorDataset([input_tensor, label_tensor])
        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
        odd_subset = paddle.io.Subset(dataset, [1, 3])

        self.assertEqual(len(dataset), 5)

        def prepare_dataloader(dataset):
            # All three loaders share the same single-sample configuration.
            return DataLoader(
                dataset,
                places=places,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True,
            )

        dataloader = prepare_dataloader(dataset)
        dataloader_even = prepare_dataloader(even_subset)
        dataloader_odd = prepare_dataloader(odd_subset)

        tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)

        def assert_basic(batch_input, batch_label):
            # Shape and type checks common to every batch.
            self.assertEqual(len(batch_input), 1)
            self.assertEqual(len(batch_label), 1)
            self.assertEqual(batch_input.shape, [1, 3, 4])
            self.assertEqual(batch_label.shape, [1, 1])
            self.assertIsInstance(batch_input, tensor_types)
            self.assertIsInstance(batch_label, tensor_types)

        # Collect all labels, then remove those seen through the even
        # subset; the remainder is compared with the odd subset's labels.
        elements_list = []
        for batch_input, batch_label in dataloader():
            assert_basic(batch_input, batch_label)
            elements_list.append(batch_label)

        for batch_input, batch_label in dataloader_even():
            assert_basic(batch_input, batch_label)
            elements_list.remove(batch_label)

        odd_list = []
        for batch_input, batch_label in dataloader_odd():
            assert_basic(batch_input, batch_label)
            odd_list.append(batch_label)

        self.assertEqual(odd_list, elements_list)

    def func_test_main(self):
        """Call ``run_main`` for CPU and, when compiled with CUDA, GPU 0."""
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

246

247 248
class TestChainDataset(unittest.TestCase):
    """Check that ``ChainDataset`` yields its sources one after another."""

    def run_main(self, num_workers, places):
        """Chain two 10-sample iterable datasets and compare the output.

        Args:
            num_workers: NOTE(review): never read; no ``DataLoader`` is
                built here. Confirm whether one was meant to be used.
            places: NOTE(review): never read, for the same reason.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1 = RandomIterableDataset(10)
        dataset2 = RandomIterableDataset(10)
        dataset = ChainDataset([dataset1, dataset2])

        # A comprehension (rather than ``list(dataset)``) so that only the
        # iteration protocol is exercised, exactly like an explicit loop.
        samples = [sample for sample in dataset]
        self.assertEqual(len(samples), 20)

        # The chained samples must match dataset1 followed by dataset2.
        idx = 0
        for source in (dataset1, dataset2):
            for image, label in source:
                self.assertTrue(np.allclose(image, samples[idx][0]))
                self.assertTrue(np.allclose(label, samples[idx][1]))
                idx += 1

    def func_test_main(self):
        """Call ``run_main`` for CPU and, when compiled with CUDA, GPU 0."""
        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

283

284 285 286 287 288 289 290 291 292 293
class NumpyMixTensorDataset(Dataset):
    """Map-style dataset mixing a paddle tensor and a NumPy array per sample.

    Each sample is ``(image, label)`` where ``image`` is a CPU paddle tensor
    built from a float32 vector of length ``IMAGE_SIZE`` and ``label`` is an
    int64 NumPy array of shape ``(1,)`` holding a value drawn from ``[0, 9)``.

    Args:
        sample_num: Number of samples the dataset reports via ``len()``.
    """

    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __len__(self):
        return self.sample_num

    def __getitem__(self, idx):
        # Seed with the index so a given sample is always the same.
        np.random.seed(idx)
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, 9, (1,)).astype('int64')
        return paddle.to_tensor(image, place=paddle.CPUPlace()), label


class TestNumpyMixTensorDataset(TestTensorDataset):
    """Run the inherited test entry points on ``NumpyMixTensorDataset``."""

    def run_main(self, num_workers, places):
        """Load ``NumpyMixTensorDataset`` with batch size 1 and check batches.

        Args:
            num_workers: Number of worker processes given to ``DataLoader``.
            places: Place supplied by the caller.
                NOTE(review): this argument is never read; the body always
                uses ``paddle.CPUPlace()``. Confirm that this is intended.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            dataset = NumpyMixTensorDataset(16)
            self.assertEqual(len(dataset), 16)
            dataloader = DataLoader(
                dataset,
                places=place,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True,
            )

            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            for batch_input, batch_label in dataloader():
                self.assertEqual(len(batch_input), 1)
                self.assertEqual(len(batch_label), 1)
                self.assertEqual(batch_input.shape, [1, IMAGE_SIZE])
                self.assertEqual(batch_label.shape, [1, 1])
                # Both the tensor field and the NumPy field must come back
                # from the loader as tensors.
                self.assertIsInstance(batch_input, tensor_types)
                self.assertIsInstance(batch_label, tensor_types)
325 326


327 328 329 330 331 332 333 334
class ComplextDataset(Dataset):
    """Map-style dataset whose samples mix several Python and array types.

    Each sample is a 5-tuple: a float, a string, a CPU paddle tensor of
    length ``IMAGE_SIZE``, a list ``[int, ndarray]`` and a dict with a float
    under ``'a'`` and an ndarray under ``'b'``.

    Args:
        sample_num: Number of samples the dataset reports via ``len()``.
    """

    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __len__(self):
        return self.sample_num

    def __getitem__(self, idx):
        # The index is ignored; array fields are freshly drawn on each call.
        return (
            3.1,
            'abc',
            paddle.to_tensor(
                np.random.random([IMAGE_SIZE]).astype('float32'),
                place=paddle.CPUPlace(),
            ),
            [1, np.random.random([2]).astype('float32')],
            {'a': 2.0, 'b': np.random.random([2]).astype('float32')},
        )
345 346 347 348 349 350 351 352 353 354


class TestComplextDataset(unittest.TestCase):
    """Check how ``DataLoader`` collates the mixed-type ``ComplextDataset``."""

    def run_main(self, num_workers):
        """Load ``ComplextDataset`` with batch size 2 and check each field.

        Args:
            num_workers: Number of worker processes given to ``DataLoader``.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            dataset = ComplextDataset(16)
            self.assertEqual(len(dataset), 16)
            dataloader = DataLoader(
                dataset,
                places=place,
                num_workers=num_workers,
                batch_size=2,
                drop_last=True,
            )

            for data in dataloader():
                self.assertEqual(len(data), 5)
                # data[0]: collate 3.1
                self.assertEqual(data[0].shape, [2])
                # data[1]: collate 'abc'
                self.assertIsInstance(data[1], list)
                self.assertEqual(len(data[1]), 2)
                self.assertIsInstance(data[1][0], str)
                self.assertIsInstance(data[1][1], str)
                # data[2]: collate tensor
                self.assertEqual(data[2].shape, [2, IMAGE_SIZE])
                # data[3]: collate list
                self.assertIsInstance(data[3], list)
                self.assertEqual(data[3][0].shape, [2])
                self.assertEqual(data[3][1].shape, [2, 2])
                # data[4]: collate dict
                self.assertIsInstance(data[4], dict)
                self.assertEqual(data[4]['a'].shape, [2])
                self.assertEqual(data[4]['b'].shape, [2, 2])

    def func_test_main(self):
        """Call ``run_main`` with 0 workers and with 2 workers."""
        for num_workers in [0, 2]:
            self.run_main(num_workers)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

392

393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
class SingleFieldDataset(Dataset):
    """Map-style dataset whose samples are one float32 array of shape (2, 3).

    Args:
        sample_num: Number of samples the dataset reports via ``len()``.
    """

    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __len__(self):
        return self.sample_num

    def __getitem__(self, idx):
        # The index is ignored: each access draws a fresh random array.
        raw_sample = np.random.random((2, 3))
        return raw_sample.astype('float32')


class TestSingleFieldDataset(unittest.TestCase):
    """Check that single-field samples are batched into a single tensor."""

    def init_dataset(self):
        """Create the dataset under test; subclasses override this hook."""
        self.sample_num = 16
        self.dataset = SingleFieldDataset(self.sample_num)

    def run_main(self, num_workers):
        """Load the dataset with batch size 2 and check every batch.

        Args:
            num_workers: Number of worker processes given to ``DataLoader``.
        """
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            self.init_dataset()
            dataloader = DataLoader(
                self.dataset,
                places=place,
                num_workers=num_workers,
                batch_size=2,
                drop_last=True,
            )

            tensor_types = (fluid.core.VarBase, fluid.core.eager.Tensor)
            for data in dataloader():
                # Each batch is one tensor stacking two (2, 3) samples.
                self.assertIsInstance(data, tensor_types)
                self.assertEqual(data.shape, [2, 2, 3])

    def func_test_main(self):
        """Call ``run_main`` with 0 workers and with 2 workers."""
        for num_workers in [0, 2]:
            self.run_main(num_workers)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453

class SingleFieldIterableDataset(IterableDataset):
    """Iterable dataset yielding single float32 arrays of shape (2, 3).

    Args:
        sample_num: Number of samples produced by one pass of ``__iter__``.
    """

    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __iter__(self):
        total = self.sample_num
        for _ in range(total):
            raw_sample = np.random.random((2, 3))
            yield raw_sample.astype('float32')


class TestSingleFieldIterableDataset(TestSingleFieldDataset):
    """Reuse the single-field checks with the iterable-style dataset."""

    def init_dataset(self):
        # Only the dataset type differs from the parent test case.
        sample_count = 16
        self.sample_num = sample_count
        self.dataset = SingleFieldIterableDataset(sample_count)


454 455 456
class TestDataLoaderGenerateStates(unittest.TestCase):
    """Pin the output of ``_generate_states`` for fixed argument pairs."""

    def setUp(self):
        # Argument tuples passed to ``_generate_states`` and, in the same
        # order, the exact state lists each call is expected to return.
        self.inputs = [(0, 1), (0, 2), (1, 3)]
        self.outputs = [
            [1835504127, 1731038949, 1320224556, 2330041505],
            [2834126987, 2358157858, 1860244682, 1437227251],
            [457190280, 2660306227, 859341110, 354512857],
        ]

    def func_test_main(self):
        """Compare ``_generate_states(*inp)`` with the recorded outputs."""
        from paddle.fluid.dataloader.worker import _generate_states

        for inp, outp in zip(self.inputs, self.outputs):
            out = _generate_states(*inp)
            self.assertEqual(out, outp)

    def test_main(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_main()
        self.func_test_main()

475

476 477 478 479
class TestDatasetWithDropLast(unittest.TestCase):
    """Check the number of batches produced with and without ``drop_last``."""

    def run_main(self, dataset, num_samples, batch_size):
        """Count batches for every worker/``drop_last`` combination.

        Args:
            dataset: Dataset handed to ``DataLoader``.
            num_samples: Number of samples ``dataset`` provides.
            batch_size: Batch size given to ``DataLoader``.
        """
        for num_workers in [0, 1]:
            for drop_last in [True, False]:
                # Floor division when the partial batch is dropped,
                # ceiling division when it is kept.
                padding = 0 if drop_last else batch_size - 1
                steps = (num_samples + padding) // batch_size
                dataloader = DataLoader(
                    dataset,
                    batch_size=batch_size,
                    drop_last=drop_last,
                    num_workers=num_workers,
                )
                # Count by plain iteration; no length query is made on the
                # loader.
                num_batches = sum(1 for _ in dataloader)
                self.assertEqual(num_batches, steps)

    def func_test_map_dataset(self):
        """Run the batch-count check on the map-style ``RandomDataset``."""
        dataset = RandomDataset(10)
        self.run_main(dataset, 10, 3)

    def test_map_dataset(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_map_dataset()
        self.func_test_map_dataset()

    def func_test_iterable_dataset(self):
        """Run the batch-count check on ``RandomIterableDataset``."""
        dataset = RandomIterableDataset(10)
        self.run_main(dataset, 10, 3)

    def test_iterable_dataset(self):
        # Run the scenario inside ``_test_eager_guard`` and again outside it.
        with _test_eager_guard():
            self.func_test_iterable_dataset()
        self.func_test_iterable_dataset()

512

513 514
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()