From 2b81d13c7f2b7f93b378bbef9d877f9481e94a90 Mon Sep 17 00:00:00 2001 From: 1want2sleep <116695878+1want2sleep@users.noreply.github.com> Date: Tue, 15 Nov 2022 18:53:16 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=94=B9=E4=BA=86=E5=BC=95=E7=94=A8?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=20(#47963)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix some docs bugs; test=document_fix * Update batch_sampler.py * Update dataset.py * Update dataset.py * Update sampler.py * for codestyle; test=document_fix * fix copy-from issue; test=document_fix Co-authored-by: Ligoml <39876205+Ligoml@users.noreply.github.com> Co-authored-by: Ligoml --- .../paddle/fluid/dataloader/batch_sampler.py | 43 ++++++++++--------- python/paddle/fluid/dataloader/dataset.py | 21 +++++---- python/paddle/fluid/dataloader/sampler.py | 17 ++++---- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py index 5ac1c79d0cd..ff749271e56 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/fluid/dataloader/batch_sampler.py @@ -37,20 +37,20 @@ class BatchSampler(Sampler): Args: - dataset(Dataset): this could be a :code:`paddle.io.Dataset` - implement or other python object which implemented + dataset(Dataset, optional): this should be an instance of a subclass of :ref:`api_paddle_io_Dataset` or + :ref:`api_paddle_io_IterableDataset` or other python object which implemented :code:`__len__` for BatchSampler to get indices as the - range of :attr:`dataset` length. Default None. - sampler (Sampler): this could be a :code:`paddle.io.Dataset` - instance which implemented :code:`__iter__` to yield + range of :attr:`dataset` length. Default None, disabled. + sampler (Sampler, optional): this should be a :ref:`api_paddle_io_Sampler` + instance which implemented :code:`__iter__` to generate sample indices.
:attr:`sampler` and :attr:`dataset` can not be set in the same time. If :attr:`sampler` - is set, :attr:`shuffle` should not be set. Default None. - shuffle(bool): whether to shuffle indices order before genrating - batch indices. Default False. - batch_size(int): sample indice number in a mini-batch indices. - drop_last(bool): whether drop the last incomplete batch dataset size - is not divisible by the batch size. Default False + is set, :attr:`dataset` should not be set. Default None, disabled. + shuffle(bool, optional): whether to shuffle indices order before generating + batch indices. Default False, don't shuffle indices before generating batch indices. + batch_size(int, optional): number of sample indices in a mini-batch. Default 1, each mini-batch includes 1 sample. + drop_last(bool, optional): whether to drop the last incomplete (less than 1 mini-batch) batch. Default False, keep it. + see :ref:`api_paddle_io_DataLoader` Returns: BatchSampler: an iterable object for indices iterating @@ -92,7 +92,6 @@ class BatchSampler(Sampler): print(batch_indices) - see `paddle.io.DataLoader` """ @@ -183,22 +182,24 @@ class DistributedBatchSampler(BatchSampler): Dataset is assumed to be of constant size. Args: - dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + dataset(Dataset): this could be an instance of a subclass of :ref:`api_paddle_io_Dataset` or other python object which implemented - `__len__` for BatchSampler to get sample - number of data source. - batch_size(int): sample indice number in a mini-batch indices. + `__len__` for BatchSampler to get indices of samples. + batch_size(int): sample size of each mini-batch. num_replicas(int, optional): porcess number in distributed training. If :attr:`num_replicas` is None, :attr:`num_replicas` will be - retrieved from :code:`paddle.distributed.ParallenEnv`. + retrieved from :ref:`api_paddle_distributed_ParallelEnv` . Default None.
rank(int, optional): the rank of the current process among :attr:`num_replicas` processes. If :attr:`rank` is None, :attr:`rank` is retrieved from - :code:`paddle.distributed.ParallenEnv`. Default None. - shuffle(bool): whther to shuffle indices order before genrating + :ref:`api_paddle_distributed_ParallelEnv`. Default None. + shuffle(bool, optional): whether to shuffle indices order before generating batch indices. Default False. - drop_last(bool): whether drop the last incomplete batch dataset size - is not divisible by the batch size. Default False + drop_last(bool, optional): whether to drop the last incomplete (less than a mini-batch) batch. + Default False. + + Returns: + DistributedBatchSampler: an iterable object for indices iterating. Examples: .. code-block:: python diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index 04e03ec844a..6d62cd9fe0a 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -89,19 +89,20 @@ class IterableDataset(Dataset): An abstract class to encapsulate methods and behaviors of iterable datasets. All datasets in iterable-style (can only get sample one by one sequentially, like - a Python iterator) should be a subclass of `paddle.io.IterableDataset`. All subclasses should + a Python iterator) should be a subclass of :ref:`api_paddle_io_IterableDataset` . All subclasses should implement following methods: - :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :code:`paddle.io.DataLoader`. + :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :ref:`api_paddle_io_DataLoader` . .. note:: do not implement :code:`__getitem__` and :code:`__len__` in IterableDataset, should not be called either. - see :code:`paddle.io.DataLoader`. + see :ref:`api_paddle_io_DataLoader` . Examples: ..
code-block:: python + :name: code-example1 import numpy as np from paddle.io import IterableDataset @@ -128,9 +129,10 @@ class IterableDataset(Dataset): among workers as follows. In both the methods, worker information that can be getted in a worker process by `paddle.io.get_worker_info` will be needed. - Example 1: splitting data copy in each worker in :code:`__iter__` + splitting data copy in each worker in :code:`__iter__` .. code-block:: python + :name: code-example2 import math import paddle @@ -169,9 +171,10 @@ class IterableDataset(Dataset): print(data) # outputs: [2, 5, 3, 6, 4, 7] - Example 2: splitting data copy in each worker by :code:`worker_init_fn` + splitting data copy in each worker by :code:`worker_init_fn` .. code-block:: python + :name: code-example3 import math import paddle @@ -370,16 +373,16 @@ class ComposeDataset(Dataset): class ChainDataset(IterableDataset): """ - A Dataset which chains multiple iterable-tyle datasets. + A Dataset which chains multiple iterable-style datasets. This dataset is used for assembling multiple datasets which should - be :code:`paddle.io.IterableDataset`. + be :ref:`api_paddle_io_IterableDataset`. Args: - datasets(list of Dataset): List of datasets to be chainned. + datasets(list of IterableDataset): List of datasets to be chained. Returns: - Dataset: A Dataset which chains fields of multiple datasets. + paddle.io.IterableDataset: A Dataset which chains fields of multiple datasets. Examples: diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py index afd8fa7da02..a6ec3ffbae9 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/fluid/dataloader/sampler.py @@ -151,16 +151,16 @@ class RandomSampler(Sampler): Args: data_source(Dataset): dataset to sample, this could be an - instance of :code:`paddle.io.Dataset` other Python - object which implemented :code:`__len__`.
- replacement(bool): If False, sample the whole dataset, If False, - set :attr:`num_samples` for how many sample to draw. Default False. - num_samples(int): set sample number to draw if :attr:`replacement` - is True. Default None. - generator(Generator): specify a generator to sample the data source. Default None + instance of :ref:`api_paddle_io_Dataset` or :ref:`api_paddle_io_IterableDataset` or other Python + object which implemented :code:`__len__` to get indices as the range of :attr:`data_source` length. Default None. + replacement(bool, optional): If False, sample the whole dataset, If True, + set :attr:`num_samples` for how many samples to draw. Default False. + num_samples(int, optional): set sample number to draw if :attr:`replacement` + is True, then it will take samples according to the number you set. Default None, disabled. + generator(Generator, optional): specify a generator to sample the :code:`data_source`. Default None, disabled. Returns: - Sampler: a Sampler yield sample index randomly + RandomSampler: a Sampler that yields sample indices randomly. Examples: @@ -185,7 +185,6 @@ class RandomSampler(Sampler): for index in sampler: print(index) - see `paddle.io.Sampler` """ def __init__( -- GitLab