From a37963b89086cee14a1895e6f290ef9b5287bae6 Mon Sep 17 00:00:00 2001 From: mls1999725 <43207078+mls1999725@users.noreply.github.com> Date: Wed, 2 Dec 2020 14:28:54 +0800 Subject: [PATCH] Update APIs in text/datasets and dataloader (#29219) * Update IterableDataset API * Update TensorDataset API * Update APIs in paddle/text/datasets * Update dataset.py --- python/paddle/fluid/dataloader/dataset.py | 73 +++++++++++------------ python/paddle/text/datasets/conll05.py | 1 - python/paddle/text/datasets/imdb.py | 1 - python/paddle/text/datasets/imikolov.py | 1 - python/paddle/text/datasets/movielens.py | 1 - 5 files changed, 34 insertions(+), 43 deletions(-) diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index 2269a98c4d9..7ae77fe501b 100644 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -126,8 +126,8 @@ class IterableDataset(Dataset): .. code-block:: python import math + import paddle import numpy as np - import paddle.fluid as fluid from paddle.io import IterableDataset, DataLoader, get_worker_info class SplitedIterableDataset(IterableDataset): @@ -151,17 +151,15 @@ class IterableDataset(Dataset): for i in range(iter_start, iter_end): yield np.array([i]) - place = fluid.CPUPlace() - with fluid.dygraph.guard(place): - dataset = SplitedIterableDataset(start=2, end=9) - dataloader = DataLoader( - dataset, - places=place, - num_workers=2, - batch_size=1, - drop_last=True) - - print(list(dataloader)) + dataset = SplitedIterableDataset(start=2, end=9) + dataloader = DataLoader( + dataset, + num_workers=2, + batch_size=1, + drop_last=True) + + for data in dataloader: + print(data) # outputs: [2, 5, 3, 6, 4, 7] Example 2: splitting data copy in each worker by :code:`worker_init_fn` @@ -169,8 +167,8 @@ class IterableDataset(Dataset): .. code-block:: python import math + import paddle import numpy as np - import paddle.fluid as fluid from paddle.io import IterableDataset, DataLoader, get_worker_info class RangeIterableDataset(IterableDataset): @@ -182,33 +180,31 @@ class IterableDataset(Dataset): for i in range(self.start, self.end): yield np.array([i]) - place = fluid.CPUPlace() - with fluid.dygraph.guard(place): - dataset = RangeIterableDataset(start=2, end=9) + dataset = RangeIterableDataset(start=2, end=9) - def worker_init_fn(worker_id): - worker_info = get_worker_info() + def worker_init_fn(worker_id): + worker_info = get_worker_info() - dataset = worker_info.dataset - start = dataset.start - end = dataset.end - num_per_worker = int( - math.ceil((end - start) / float(worker_info.num_workers))) - - worker_id = worker_info.id - dataset.start = start + worker_id * num_per_worker - dataset.end = min(dataset.start + num_per_worker, end) - - dataloader = DataLoader( - dataset, - places=place, - num_workers=2, - batch_size=1, - drop_last=True, - worker_init_fn=worker_init_fn) - - print(list(dataloader)) - # outputs: [2, 5, 3, 6, 4, 7] + dataset = worker_info.dataset + start = dataset.start + end = dataset.end + num_per_worker = int( + math.ceil((end - start) / float(worker_info.num_workers))) + + worker_id = worker_info.id + dataset.start = start + worker_id * num_per_worker + dataset.end = min(dataset.start + num_per_worker, end) + + dataloader = DataLoader( + dataset, + num_workers=2, + batch_size=1, + drop_last=True, + worker_init_fn=worker_init_fn) + + for data in dataloader: + print(data) + # outputs: [2, 5, 3, 6, 4, 7] """ @@ -250,7 +246,6 @@ class TensorDataset(Dataset): import paddle from paddle.io import TensorDataset - paddle.disable_static() input_np = np.random.random([2, 3, 4]).astype('float32') input = paddle.to_tensor(input_np) diff --git a/python/paddle/text/datasets/conll05.py b/python/paddle/text/datasets/conll05.py index 8dd6db656eb..23a2f1c8f28 100644 --- a/python/paddle/text/datasets/conll05.py +++ b/python/paddle/text/datasets/conll05.py @@ -81,7 +81,6 @@ class Conll05st(Dataset): def forward(self, pred_idx, mark, label): return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label) - paddle.disable_static() conll05st = Conll05st() diff --git a/python/paddle/text/datasets/imdb.py b/python/paddle/text/datasets/imdb.py index f02b5981906..142c70c953b 100644 --- a/python/paddle/text/datasets/imdb.py +++ b/python/paddle/text/datasets/imdb.py @@ -59,7 +59,6 @@ class Imdb(Dataset): def forward(self, doc, label): return paddle.sum(doc), label - paddle.disable_static() imdb = Imdb(mode='train') diff --git a/python/paddle/text/datasets/imikolov.py b/python/paddle/text/datasets/imikolov.py index cfd437021b9..1a1c625f605 100644 --- a/python/paddle/text/datasets/imikolov.py +++ b/python/paddle/text/datasets/imikolov.py @@ -59,7 +59,6 @@ class Imikolov(Dataset): def forward(self, src, trg): return paddle.sum(src), paddle.sum(trg) - paddle.disable_static() imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2) diff --git a/python/paddle/text/datasets/movielens.py b/python/paddle/text/datasets/movielens.py index 75b59cfbb0d..1f399eebd3b 100644 --- a/python/paddle/text/datasets/movielens.py +++ b/python/paddle/text/datasets/movielens.py @@ -116,7 +116,6 @@ class Movielens(Dataset): def forward(self, category, title, rating): return paddle.sum(category), paddle.sum(title), paddle.sum(rating) - paddle.disable_static() movielens = Movielens(mode='train') -- GitLab