diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py
index ddc010d04280c8d498f835deeff5621420bd9993..8e90b308b393ed04b295eb80ab6272c12f807391 100644
--- a/python/paddle/fluid/dataloader/collate.py
+++ b/python/paddle/fluid/dataloader/collate.py
@@ -27,24 +27,31 @@ except:
 def default_collate_fn(batch):
     """
     Default batch collating function for :code:`paddle.io.DataLoader`,
-    batch should be a list of samples, and each sample should be a list
-    of fields as follows:
+    which takes a list of sample data as input, where each element in
+    the list is the data of one sample. Each sample should be composed of
+    lists, dictionaries, strings, numbers, numpy arrays or paddle.Tensor.
+    This function parses the input data recursively and stacks number,
+    numpy array and paddle.Tensor fields into batch data, e.g. for the
+    following input data:
+
+    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
 
-    [[filed1, filed2, ...], [filed1, filed2, ...], ...]
 
-    This default collate function zipped each filed together and stack
-    each filed as the batch field as follows:
+    This default collate function zips the number and numpy array
+    fields together and stacks each field as a batch field as follows:
+
+    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
 
-    [batch_filed1, batch_filed2, ...]
 
     Args:
-        batch(list of list of numpy array|paddle.Tensor): the batch data, each fields
-              should be a numpy array, each sample should be a list of
-              fileds, and batch should be a list of sample.
+        batch(list of sample data): batch should be a list of sample data.
 
     Returns:
-        a list of numpy array|Paddle.Tensor: collated batch of input batch data,
-        fields data type as same as fields in each sample.
+        Batched data: the input data with each number, numpy array and
+                      paddle.Tensor field stacked into a batch.
     """
     sample = batch[0]
     if isinstance(sample, np.ndarray):
@@ -75,6 +82,24 @@ def default_collate_fn(batch):
 
 
 def default_convert_fn(batch):
+    """
+    Default batch converting function for :code:`paddle.io.DataLoader`,
+    which takes a list of sample data as input, where each element in
+    the list is the data of one sample. Each sample should be composed of
+    lists, dictionaries, strings, numbers, numpy arrays or paddle.Tensor.
+
+    .. note::
+        This function is the default :attr:`collate_fn` in **Disable
+        automatic batching** mode. For details of **Disable automatic
+        batching** mode, please see :attr:`paddle.io.DataLoader`.
+
+    Args:
+        batch(list of sample data): batch should be a list of sample data.
+
+    Returns:
+        Converted data: the input sample data parsed recursively, without
+                        stacking fields into a batch.
+    """
     if isinstance(batch, (paddle.Tensor, np.ndarray)):
         return batch
     elif isinstance(batch, (str, bytes)):
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index be196b73edd69808812ced0927f5625a281bcf7b..9f2b2127aa7043546e84c8cc0295349108f407f5 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -165,6 +165,12 @@ class DataLoader(object):
     For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
 
+    .. note::
+        GPU tensor operations are not currently supported in subprocesses,
+        so please do not use GPU tensors in any part of the pipeline that
+        runs in a subprocess, such as dataset transforms, collate_fn, etc.
+        Numpy array and CPU tensor operations are supported.
+
     **Disable automatic batching** In certain cases such as some NLP tasks, instead of automatic batching,
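As a quick illustration of the behavior the new default_collate_fn docstring describes, the sketch below builds a batch of dict samples and collates them. It is not part of the patch itself; the import path simply mirrors the module touched above, and the array contents are dummy values.

    import numpy as np
    from paddle.fluid.dataloader.collate import default_collate_fn

    # Four dict samples, mirroring the docstring example (values are dummies).
    batch = [
        {'image': np.zeros([3, 224, 224], dtype='float32'), 'label': 1},
        {'image': np.zeros([3, 224, 224], dtype='float32'), 'label': 3},
        {'image': np.zeros([3, 224, 224], dtype='float32'), 'label': 4},
        {'image': np.zeros([3, 224, 224], dtype='float32'), 'label': 5},
    ]

    # The collate function recurses into the dict and stacks each field:
    # numpy arrays are stacked along a new batch axis, plain numbers are
    # gathered into a single array.
    collated = default_collate_fn(batch)
    print(collated['image'].shape)  # (4, 3, 224, 224)
    print(collated['label'])        # [1 3 4 5]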
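The reader.py hunk documents the **Disable automatic batching** mode that default_convert_fn serves. Below is a minimal sketch of that mode, assuming batch_size=None is the switch the rest of the docstring describes; the dataset class, shapes and names are hypothetical, not taken from the patch.

    import numpy as np
    import paddle
    from paddle.io import DataLoader, Dataset

    class SingleSampleDataset(Dataset):
        # A toy dataset whose samples need no further batching; the field
        # names and shapes here are illustrative only.
        def __len__(self):
            return 4

        def __getitem__(self, idx):
            image = np.random.random([3, 224, 224]).astype('float32')
            label = np.array([idx], dtype='int64')
            return image, label

    # Assumption: batch_size=None turns off automatic batching, so
    # default_convert_fn (rather than default_collate_fn) is applied and
    # each iteration yields one un-stacked sample.
    loader = DataLoader(SingleSampleDataset(), batch_size=None, num_workers=0)
    for image, label in loader:
        print(image.shape, label.shape)  # [3, 224, 224] and [1] per sample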