From 95b1868366887e8b84dd8636601c066b3ef0b2f8 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Wed, 4 Nov 2020 12:07:15 +0800 Subject: [PATCH] update DataLoader doc (#28290) * update DataLoader doc. test=develop --- python/paddle/fluid/reader.py | 103 ++++++++++++---------------------- 1 file changed, 35 insertions(+), 68 deletions(-) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 35dcd45223..0e7fd35f58 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -153,18 +153,22 @@ class DataLoader(object): multi-process workers will be used to load data asynchronously if :attr:`num_workers` is set as a positive number. - DataLoader only supports map-style dataset(can get a sample from - dataset with a given index) currently, for a map-style dataset, - please see :code:`paddle.io.Dataset`. + DataLoader supports map-style dataset and iterable-style dataset. - batch_sampler please see :code:`paddle.io.BatchSampler` + For map-style dataset(can get a sample from dataset with a given + index), please see :code:`paddle.io.Dataset`. + + For iterable-style dataset(get samples from dataset iteratively, + like a Python iterator), please see :code:`paddle.io.IterableDataset`. + + For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` Args: dataset(Dataset): the dataset to load data from, should be an instance of subclass of :code:`paddle.io.Dataset` or :code:`paddle.io.IterableDataset`. feed_list (list(Tensor)|tuple(Tensor)): feed variable list. - The variables should be created by :code:`fluid.data()`. + The variables should be created by :code:`paddle.static.data()`. :attr:`feed_list` must be set if :attr:`return_list` is False. Default None. places(list(Place)|tuple(Place)|optional): a list of Place, @@ -173,10 +177,10 @@ class DataLoader(object): will be used. Default None. return_list (bool): whether the return value on each device is presented as a list. 
If :attr:`return_list=False`, the return - value on each device would be a dict of str -> LoDTensor, where + value on each device would be a dict of str -> Tensor, where the key of the dict is the name of each fed variables. If :attr:`return_list=True`, the return value on each device would - be a list(LoDTensor). :attr:`return_list` can only be True + be a list(Tensor). :attr:`return_list` can only be True in dynamic graph mode. Default False. batch_sampler(BatchSampler): an instance of `paddle.io.BatchSampler` to generate batch indices to draw samples from :attr:`dataset` @@ -224,7 +228,8 @@ class DataLoader(object): import numpy as np import paddle - import paddle.fluid as fluid + import paddle.nn as nn + import paddle.nn.functional as F from paddle.io import Dataset, BatchSampler, DataLoader BATCH_NUM = 20 @@ -234,8 +239,6 @@ class DataLoader(object): IMAGE_SIZE = 784 CLASS_NUM = 10 - USE_GPU = False # whether use GPU to run model - # define a random dataset class RandomDataset(Dataset): def __init__(self, num_samples): @@ -251,78 +254,34 @@ class DataLoader(object): dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - # get places - places = fluid.cuda_places() if USE_GPU else fluid.cpu_places() - - # --------------------- dygraph mode -------------------- - - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self): super(SimpleNet, self).__init__() - self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax') + self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) def forward(self, image, label=None): return self.fc(image) - with fluid.dygraph.guard(places[0]): - simple_net = SimpleNet() - opt = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = fluid.layers.cross_entropy(out, 
label) - avg_loss = fluid.layers.reduce_mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - # ------------------------------------------------------- - - # -------------------- static graph --------------------- - - paddle.enable_static() - - def simple_net(image, label): - fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax') - cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label) - loss = fluid.layers.reduce_mean(cross_entropy) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - return loss - - image = fluid.data(name='image', shape=[None, IMAGE_SIZE], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - - loss = simple_net(image, label) - - exe = fluid.Executor(places[0]) - exe.run(fluid.default_startup_program()) - - prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name) + simple_net = SimpleNet() + opt = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=simple_net.parameters()) loader = DataLoader(dataset, - feed_list=[image, label], - batch_size=BATCH_SIZE, + batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2) for e in range(EPOCH_NUM): - for i, data in enumerate(loader()): - l = exe.run(prog, feed=data, fetch_list=[loss], return_numpy=True) - print("Epoch {} batch {}: loss = {}".format(e, i, l[0][0])) + for i, (image, label) in enumerate(loader()): + out = simple_net(image) + loss = F.cross_entropy(out, label) + avg_loss = paddle.mean(loss) + avg_loss.backward() + opt.minimize(avg_loss) + simple_net.clear_gradients() + print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - # ------------------------------------------------------- - .. 
note:: For reading iterable dataset with multiprocess Dataloader, @@ -439,6 +398,10 @@ class DataLoader(object): use_multiprocess=False, drop_last=True): """ + .. warning:: + This API will be deprecated in the future; it is recommended to use + :code:`paddle.io.DataLoader` which supports multi-process acceleration. + .. note:: **The framework ensures that the data loading order of DataLoader is exactly the same as the user-defined data source.** @@ -684,6 +647,10 @@ class DataLoader(object): @staticmethod def from_dataset(dataset, places, drop_last=True): """ + .. warning:: + This API will be deprecated in the future; it is recommended to use + :code:`paddle.io.DataLoader` which supports multi-process acceleration. + Create an iterable DataLoader object for loading data from Dataset. Dataset is only supported in Linux system currently. -- GitLab