From bf09dcb346c9aa4c20fbfaf520ab781d4f640346 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Thu, 25 Mar 2021 14:08:22 +0800 Subject: [PATCH] add GPU tensor notice & update default_collate_fn/default_convert_fn. test=develop (#31763) --- python/paddle/fluid/dataloader/collate.py | 47 +++++++++++++++++------ python/paddle/fluid/reader.py | 6 +++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index ddc010d0428..8e90b308b39 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -27,24 +27,31 @@ except: def default_collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`, - batch should be a list of samples, and each sample should be a list - of fields as follows: + get input data as a list of sample data, each element in list + is the data of a sample, and sample data should be composed of list, + dictionary, string, number, numpy array and paddle.Tensor, this + function will parse input data recursively and stack number, + numpy array and paddle.Tensor data as batch data. e.g. for + following input data: + + [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, + {'image': np.array(shape=[3, 224, 224]), 'label': 3}, + {'image': np.array(shape=[3, 224, 224]), 'label': 4}, + {'image': np.array(shape=[3, 224, 224]), 'label': 5},] - [[filed1, filed2, ...], [filed1, filed2, ...], ...] - This default collate function zipped each filed together and stack - each filed as the batch field as follows: + This default collate function zips each number and numpy array + field together and stacks each field as the batch field as follows: + + {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} - [batch_filed1, batch_filed2, ...] 
Args: - batch(list of list of numpy array|paddle.Tensor): the batch data, each fields - should be a numpy array, each sample should be a list of - fileds, and batch should be a list of sample. + batch(list of sample data): batch should be a list of sample data. Returns: - a list of numpy array|Paddle.Tensor: collated batch of input batch data, - fields data type as same as fields in each sample. + Batched data: batched each number, numpy array and paddle.Tensor + in input data. """ sample = batch[0] if isinstance(sample, np.ndarray): @@ -75,6 +82,24 @@ def default_collate_fn(batch): def default_convert_fn(batch): + """ + Default batch converting function for :code:`paddle.io.DataLoader`. + get input data as a list of sample data, each element in list + is the data of a sample, and sample data should be composed of list, + dictionary, string, number, numpy array and paddle.Tensor. + + .. note:: + This function is the default :attr:`collate_fn` in **Disable + automatic batching** mode, for **Disable automatic batching** + mode, please see :attr:`paddle.io.DataLoader` + + Args: + batch(list of sample data): batch should be a list of sample data. + + Returns: + Batched data: batched each number, numpy array and paddle.Tensor + in input data. + """ if isinstance(batch, (paddle.Tensor, np.ndarray)): return batch elif isinstance(batch, (str, bytes)): diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index be196b73edd..9f2b2127aa7 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -165,6 +165,12 @@ class DataLoader(object): For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` + .. note:: + GPU tensor operation is not supported in subprocess currently, + please don't use GPU tensor operations in pipeline which will + be performed in subprocess, such as dataset transforms, collate_fn, + etc. Numpy array and CPU tensor operations are supported. 
+ **Disable automatic batching** In certain cases such as some NLP tasks, instead of automatic batching, -- GitLab