From 9cd09a858627ebb7f4755ac782e5d36b7bc1740c Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 24 Nov 2020 10:04:55 +0800
Subject: [PATCH] Polish dataloader doc detail & update example (#28975)

* polish dataloader doc detail, test=document_fix
* fix comment error
* fix word error
---
 python/paddle/fluid/reader.py | 173 ++++++++++++++++++++++++----------
 1 file changed, 124 insertions(+), 49 deletions(-)

diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index ac924580e17..09850b3cac9 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -182,8 +182,8 @@ class DataLoader(object):
         dataset(Dataset): the dataset to load data from, should be an
             instance of subclass of :code:`paddle.io.Dataset` or
             :code:`paddle.io.IterableDataset`.
-        feed_list (list(Tensor)|tuple(Tensor)): feed variable list.
-            The variables should be created by :code:`paddle.static.data()`.
+        feed_list (list(Tensor)|tuple(Tensor)): feed Tensor list.
+            The Tensors should be created by :code:`paddle.static.data()`.
             :attr:`feed_list` must be set if :attr:`return_list` is
             False. Default None.
         places(list(Place)|tuple(Place)|optional): a list of Place,
@@ -193,7 +193,7 @@ class DataLoader(object):
         return_list (bool): whether the return value on each device is
             presented as a list. If :attr:`return_list=False`, the return
             value on each device would be a dict of str -> Tensor, where
-            the key of the dict is the name of each fed variables. If
+            the key of the dict is the name of each fed Tensor. If
             :attr:`return_list=True`, the return value on each device would
             be a list(Tensor). :attr:`return_list` can only be True
             in dynamic graph mode. Default True.
@@ -447,14 +447,11 @@ class DataLoader(object):
 
         If iterable = False, the created DataLoader object provides
         :code:`start()` and :code:`reset()` method to control the data reading
-        process. This mode is designed to be compatible with the
-        :code:`fluid.layers.py_reader` interface. Users can migrate the codes
-        from :code:`fluid.layers.py_reader` to :code:`fluid.io.DataLoader`
-        easily when using iterable=False.
+        process.
 
         Args:
-            feed_list (list(Variable)|tuple(Variable)): feed variable list.
-                The variables should be created by :code:`fluid.data()`.
+            feed_list (list(Tensor)|tuple(Tensor)): feed Tensor list.
+                The Tensors should be created by :code:`fluid.data()`.
             capacity (int): capacity of the queue maintained in DataLoader.
                 The unit is batch number. Set larger capacity if your reader
                 is fast.
@@ -468,7 +465,7 @@
                 presented as a list. It is only valid when iterable=True.
                 If return_list=False, the return value on each device would
                 be a dict of str -> LoDTensor, where the key of the dict is
-                the name of each fed variables. If return_list=True, the
+                the name of each fed Tensor. If return_list=True, the
                 return value on each device would be a list(LoDTensor). It is
                 recommended to use return_list=False in static graph mode and
                 use return_list=True in dygraph mode.
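For orientation, and not as part of this patch: the arguments documented above (dataset, places, return_list) belong to the Dataset-based constructor rather than to from_generator. A minimal sketch of that path in dynamic graph mode, with an illustrative RandomDataset and sizes chosen only for the example, could look like this:

.. code-block:: python

    import numpy as np
    from paddle.io import Dataset, DataLoader

    class RandomDataset(Dataset):
        # toy in-memory dataset; shapes mirror the 784-feature examples below
        def __init__(self, num_samples):
            self.num_samples = num_samples

        def __getitem__(self, idx):
            image = np.random.random([784]).astype('float32')
            label = np.random.randint(0, 10, (1, )).astype('int64')
            return image, label

        def __len__(self):
            return self.num_samples

    # return_list defaults to True in dynamic graph mode, so each step yields
    # an [image, label] pair of Tensors instead of a name -> Tensor feed dict
    loader = DataLoader(RandomDataset(64), batch_size=16, shuffle=True, drop_last=True)
    for batch_id, (image, label) in enumerate(loader()):
        assert image.shape == [16, 784]
        assert label.shape == [16, 1]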
@@ -492,9 +489,16 @@ class DataLoader(object):
 
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                '''
+                Example in static graph mode
+                '''
                 import numpy as np
+
+                import paddle
+                import paddle.static as static
+                import paddle.nn.functional as F
+
                 BATCH_NUM = 10
                 BATCH_SIZE = 16
                 EPOCH_NUM = 4
@@ -506,11 +510,13 @@ class DataLoader(object):
                 DATA_FORMAT = 'batch_generator' # data format of data source user provides
 
+                paddle.enable_static()
+
                 def simple_net(image, label):
-                    fc_tmp = fluid.layers.fc(image, size=CLASS_NUM)
-                    cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
-                    loss = fluid.layers.reduce_mean(cross_entropy)
-                    sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+                    fc_tmp = static.nn.fc(image, size=CLASS_NUM)
+                    cross_entropy = F.softmax_with_cross_entropy(fc_tmp, label)
+                    loss = paddle.mean(cross_entropy)
+                    sgd = paddle.optimizer.SGD(learning_rate=1e-3)
                     sgd.minimize(loss)
                     return loss
@@ -566,7 +572,7 @@ class DataLoader(object):
                     try:
                         while True:
                             exe.run(prog, fetch_list=[loss])
-                    except fluid.core.EOFException:
+                    except paddle.core.EOFException:
                         loader.reset() # call DataLoader.reset() after catching EOFException
 
                 def set_data_source(loader, places):
@@ -579,11 +585,11 @@ class DataLoader(object):
                     else:
                         raise ValueError('Unsupported data format')
 
-                image = fluid.data(name='image', shape=[None, 784], dtype='float32')
-                label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+                image = static.data(name='image', shape=[None, 784], dtype='float32')
+                label = static.data(name='label', shape=[None, 1], dtype='int64')
 
                 # Define DataLoader
-                loader = fluid.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE)
+                loader = paddle.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE)
 
                 # Define network
                 loss = simple_net(image, label)
@@ -591,17 +597,17 @@ class DataLoader(object):
                 # Set data source of DataLoader
                 #
                 # If DataLoader is iterable, places must be given and the number of places must be the same with device number.
-                #  - If you are using GPU, call `fluid.cuda_places()` to get all GPU places.
-                #  - If you are using CPU, call `fluid.cpu_places()` to get all CPU places.
+                #  - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places.
+                #  - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places.
                 #
                 # If DataLoader is not iterable, places can be None.
-                places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
+                places = static.cuda_places() if USE_GPU else static.cpu_places()
                 set_data_source(loader, places)
 
-                exe = fluid.Executor(places[0])
-                exe.run(fluid.default_startup_program())
+                exe = static.Executor(places[0])
+                exe.run(static.default_startup_program())
 
-                prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
+                prog = static.CompiledProgram(static.default_main_program()).with_data_parallel(loss_name=loss.name)
 
                 if loader.iterable:
                     train_iterable(exe, prog, loss, loader)
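A sketch between the hunks, not part of this patch: set_data_source() in Example 1 switches on DATA_FORMAT, but the three setters expect differently shaped generators. Assuming the same constants as the example, the expected shapes are roughly as follows; the commented calls mirror the ones used above.

.. code-block:: python

    import numpy as np

    BATCH_NUM, BATCH_SIZE, CLASS_NUM = 10, 16, 10  # as in Example 1

    def sample_generator():
        # set_sample_generator(): one sample per yield, no batch dimension;
        # batching is done by the loader (batch_size is passed to the setter)
        for _ in range(BATCH_NUM * BATCH_SIZE):
            yield np.random.random([784]).astype('float32'), \
                  np.random.randint(0, CLASS_NUM, (1, )).astype('int64')

    def sample_list_generator():
        # set_sample_list_generator(): a list of BATCH_SIZE samples per yield
        for _ in range(BATCH_NUM):
            yield [(np.random.random([784]).astype('float32'),
                    np.random.randint(0, CLASS_NUM, (1, )).astype('int64'))
                   for _ in range(BATCH_SIZE)]

    def batch_generator():
        # set_batch_generator(): already-batched numpy arrays per yield
        for _ in range(BATCH_NUM):
            yield np.random.random([BATCH_SIZE, 784]).astype('float32'), \
                  np.random.randint(0, CLASS_NUM, (BATCH_SIZE, 1)).astype('int64')

    # loader.set_sample_generator(sample_generator, batch_size=BATCH_SIZE, places=places)
    # loader.set_sample_list_generator(sample_list_generator, places=places)
    # loader.set_batch_generator(batch_generator, places=places)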
@@ -609,45 +615,110 @@ class DataLoader(object):
                 train_non_iterable(exe, prog, loss, loader)
 
+        Examples 2:
+
+            .. code-block:: python
+
                 '''
-                Users can use return_list = True in dygraph mode.
+                Example in dynamic graph mode.
                 '''
-                with fluid.dygraph.guard(places[0]):
-                    loader = fluid.io.DataLoader.from_generator(capacity=2, return_list=True)
-                    set_data_source(loader, places[0])
-                    for image, label in loader():
-                        relu = fluid.layers.relu(image)
-                        assert image.shape == [BATCH_SIZE, 784]
-                        assert label.shape == [BATCH_SIZE, 1]
-                        assert relu.shape == [BATCH_SIZE, 784]
+                import numpy as np
 
-        Examples 2:
+                import paddle
+                import paddle.nn as nn
+                import paddle.optimizer as opt
+
+                BATCH_SIZE = 16
+                BATCH_NUM = 4
+                EPOCH_NUM = 4
+
+                IMAGE_SIZE = 784
+                CLASS_NUM = 10
+
+                USE_GPU = False # whether to use GPU
+
+                def _get_random_images_and_labels(image_shape, label_shape):
+                    image = np.random.random(size=image_shape).astype('float32')
+                    label = np.random.randint(0, CLASS_NUM, size=label_shape).astype('int64')
+                    return image, label
+
+                def __reader__():
+                    for _ in range(BATCH_NUM):
+                        batch_image, batch_label = _get_random_images_and_labels(
+                            [BATCH_SIZE, IMAGE_SIZE], [BATCH_SIZE, 1])
+                        yield batch_image, batch_label
+
+                def random_batch_reader():
+                    return __reader__
+
+                class LinearNet(nn.Layer):
+                    def __init__(self):
+                        super(LinearNet, self).__init__()
+                        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
+
+                    @paddle.jit.to_static
+                    def forward(self, x):
+                        return self._linear(x)
+
+                # set device
+                paddle.set_device('gpu' if USE_GPU else 'cpu')
+
+                # create network
+                layer = LinearNet()
+                loss_fn = nn.CrossEntropyLoss()
+                adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
+
+                # create data loader
+                loader = paddle.io.DataLoader.from_generator(capacity=5)
+                loader.set_batch_generator(random_batch_reader())
+
+                for epoch_id in range(EPOCH_NUM):
+                    for batch_id, (image, label) in enumerate(loader()):
+                        out = layer(image)
+                        loss = loss_fn(out, label)
+
+                        loss.backward()
+
+                        adam.step()
+                        adam.clear_grad()
+                        print("Epoch {} batch {}: loss = {}".format(
+                            epoch_id, batch_id, np.mean(loss.numpy())))
+
+        Examples 3:
 
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                '''
+                Example of `drop_last` usage in static graph multi-card mode
+                '''
+                import paddle
+                import paddle.static as static
                 import numpy as np
                 import os
 
                 # We use 2 CPU cores to run inference network
                 os.environ['CPU_NUM'] = '2'
 
+                paddle.enable_static()
+
                 # The data source has only 3 batches, which can not be
                 # divided evenly to each CPU core
                 def batch_generator():
                     for i in range(3):
                         yield np.array([i+1]).astype('float32'),
 
-                x = fluid.data(name='x', shape=[None], dtype='float32')
+                x = static.data(name='x', shape=[None], dtype='float32')
                 y = x * x
 
                 def run_inference(drop_last):
-                    loader = fluid.io.DataLoader.from_generator(feed_list=[x],
+                    loader = paddle.io.DataLoader.from_generator(feed_list=[x],
                             capacity=8, drop_last=drop_last)
-                    loader.set_batch_generator(batch_generator, fluid.cpu_places())
+                    loader.set_batch_generator(batch_generator, static.cpu_places())
 
-                    exe = fluid.Executor(fluid.CPUPlace())
-                    prog = fluid.CompiledProgram(fluid.default_main_program())
+                    exe = static.Executor(paddle.CPUPlace())
+                    prog = static.CompiledProgram(static.default_main_program())
                     prog = prog.with_data_parallel()
 
                     result = []
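The hunk above stops inside run_inference; the rest of that function is unchanged context and therefore not shown in the diff. Purely as a hedged sketch of typical usage (mirroring train_iterable from Example 1, not a claim about the hidden lines), an iterable loader built this way is usually drained along these lines:

.. code-block:: python

    import numpy as np

    def drain_loader(exe, prog, loader, fetch_var):
        # iterate the iterable loader and collect the fetched values; with
        # CPU_NUM=2 places and 3 batches, drop_last decides whether the
        # final odd batch is kept or discarded
        result = []
        for data in loader():
            fetched, = exe.run(prog, feed=data, fetch_list=[fetch_var])
            result.extend(np.array(fetched).flatten().tolist())
        return result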
@@ -698,18 +769,22 @@ class DataLoader(object):
 
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
+                import paddle.static as static
 
-                image = fluid.data(name='image', shape=[None, 784], dtype='float32')
-                label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+                paddle.enable_static()
+
+                image = static.data(name='image', shape=[None, 784], dtype='float32')
+                label = static.data(name='label', shape=[None, 1], dtype='int64')
 
-                dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-                dataset.set_batch_size(32)
+                dataset = paddle.distributed.QueueDataset()
+                dataset.init(
+                    batch_size=32,
+                    pipe_command='cat',
+                    use_var=[image, label])
                 dataset.set_filelist(['a.txt', 'b.txt', 'c.txt'])
-                dataset.set_use_var([image, label])
-                dataset.set_pipe_command('cat')
-                loader = fluid.io.DataLoader.from_dataset(dataset, fluid.cpu_places())
+                loader = paddle.io.DataLoader.from_dataset(dataset, static.cpu_places())
         """
         return DatasetLoader(dataset, places, drop_last)
-- 
GitLab
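Closing sketch, not part of the patch: the QueueDataset configured in the last hunk needs data files and an executor around it before anything runs. The snippet below assumes a.txt, b.txt and c.txt already exist and match the use_var slots, and it drives the dataset through Executor.train_from_dataset only to show the surrounding static-graph boilerplate; it is one possible consumption path, not the only one.

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    image = static.data(name='image', shape=[None, 784], dtype='float32')
    label = static.data(name='label', shape=[None, 1], dtype='int64')
    hidden = static.nn.fc(image, size=10)  # minimal graph so the program has ops to run

    dataset = paddle.distributed.QueueDataset()
    dataset.init(batch_size=32, pipe_command='cat', use_var=[image, label])
    dataset.set_filelist(['a.txt', 'b.txt', 'c.txt'])  # assumed to exist on disk

    exe = static.Executor(paddle.CPUPlace())
    exe.run(static.default_startup_program())
    # forward-only program here; train_from_dataset makes one pass over the filelist
    exe.train_from_dataset(static.default_main_program(), dataset)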