diff --git a/docs/source/index.rst b/docs/source/index.rst index 05366a032b3a548263d1a1fa22b7226d8acbceab..8b4ab8f0b68697820f6ffab30e59a5921bf9cca2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,6 +20,8 @@ OneFlow API Reference linalg image optim + utils + Indices and tables diff --git a/docs/source/utils.rst b/docs/source/utils.rst new file mode 100644 index 0000000000000000000000000000000000000000..d9452aaa99a1a048ac0b7437c66ebb4013ab3e8a --- /dev/null +++ b/docs/source/utils.rst @@ -0,0 +1,62 @@ +oneflow.utils +=================================== +Utils +---------------------------------- +.. currentmodule:: oneflow.utils +.. automodule:: oneflow.utils.data + :members: DataLoader, + Dataset, + IterableDataset, + TensorDataset, + ConcatDataset, + Subset, + random_split, + Sampler, + SequentialSampler, + RandomSampler, + SubsetRandomSampler, + BatchSampler + +.. currentmodule:: oneflow.utils +.. automodule:: oneflow.utils.data.distributed + :members: DistributedSampler + +.. currentmodule:: oneflow.utils +.. automodule:: oneflow.utils.vision.datasets + :members: MNIST, + FashionMNIST, + CIFAR10, + CIFAR100, + ImageNet, + CocoCaptions, + CocoDetection, + VOCDetection, + VOCSegmentation, + DatasetFolder, + ImageFolder + +.. currentmodule:: oneflow.utils +.. automodule:: oneflow.utils.vision.transforms + :members: Compose, + ToTensor, + PILToTensor, + ConvertImageDtype, + ToPILImage, + Normalize, + Resize, + Scale, + CenterCrop, + Pad, + Lambda, + RandomTransforms, + RandomApply, + RandomOrder, + RandomChoice, + RandomCrop, + RandomHorizontalFlip, + RandomVerticalFlip, + RandomResizedCrop, + RandomSizedCrop, + FiveCrop, + TenCrop, + InterpolationMode diff --git a/python/oneflow/test/dataloader/data_utils.py b/python/oneflow/test/dataloader/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e747db5063820223c93bb2c2f2eb0fdb1d9e9b40 --- /dev/null +++ b/python/oneflow/test/dataloader/data_utils.py @@ -0,0 +1,148 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import os +import oneflow as flow +import oneflow.utils.vision.transforms as transforms + + +def load_data_cifar10( + batch_size, + data_dir="./data-test/cifar10", + download=True, + transform=None, + source_url=None, + num_workers=0, +): + cifar10_train = flow.utils.vision.datasets.CIFAR10( + root=data_dir, + train=True, + download=download, + transform=transform, + source_url=source_url, + ) + cifar10_test = flow.utils.vision.datasets.CIFAR10( + root=data_dir, + train=False, + download=download, + transform=transform, + source_url=source_url, + ) + + train_iter = flow.utils.data.DataLoader( + cifar10_train, batch_size=batch_size, shuffle=True, num_workers=num_workers + ) + test_iter = flow.utils.data.DataLoader( + cifar10_test, batch_size=batch_size, shuffle=False, num_workers=num_workers + ) + return train_iter, test_iter + + +def load_data_mnist( + batch_size, resize=None, root="./data/mnist", download=True, source_url=None +): + """Download the MNIST dataset and then load into memory.""" + root = os.path.expanduser(root) + transformer = [] + if resize: + transformer += [transforms.Resize(resize)] + transformer += [transforms.ToTensor()] + transformer = transforms.Compose(transformer) + + mnist_train = flow.utils.vision.datasets.MNIST( + root=root, + train=True, + transform=transformer, + download=download, + source_url=source_url, + ) + mnist_test = flow.utils.vision.datasets.MNIST( + root=root, + train=False, + transform=transformer, + download=download, + source_url=source_url, + ) + train_iter = flow.utils.data.DataLoader(mnist_train, batch_size, shuffle=True) + test_iter = flow.utils.data.DataLoader(mnist_test, batch_size, shuffle=False) + return train_iter, test_iter + + +def get_fashion_mnist_dataset( + resize=None, root="./data-test/fashion-mnist", download=True, source_url=None, +): + root = os.path.expanduser(root) + trans = [] + if resize: + trans.append(transforms.Resize(resize)) + trans.append(transforms.ToTensor()) + transform = transforms.Compose(trans) + + mnist_train = flow.utils.vision.datasets.FashionMNIST( + root=root, + train=True, + transform=transform, + download=download, + source_url=source_url, + ) + mnist_test = flow.utils.vision.datasets.FashionMNIST( + root=root, + train=False, + transform=transform, + download=download, + source_url=source_url, + ) + return mnist_train, mnist_test + + +# reference: http://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.10_mlp-pytorch +def load_data_fashion_mnist( + batch_size, + resize=None, + root="./data-test/fashion-mnist", + download=True, + source_url=None, + num_workers=0, +): + """Download the Fashion-MNIST dataset and then load into memory.""" + root = os.path.expanduser(root) + trans = [] + if resize: + trans.append(transforms.Resize(resize)) + trans.append(transforms.ToTensor()) + transform = transforms.Compose(trans) + + mnist_train = flow.utils.vision.datasets.FashionMNIST( + root=root, + train=True, + transform=transform, + download=download, + source_url=source_url, + ) + mnist_test = flow.utils.vision.datasets.FashionMNIST( + root=root, + train=False, + transform=transform, + download=download, + source_url=source_url, + ) + + train_iter = flow.utils.data.DataLoader( + mnist_train, batch_size, shuffle=True, num_workers=num_workers + ) + test_iter = flow.utils.data.DataLoader( + mnist_test, batch_size, shuffle=False, num_workers=num_workers + ) + return train_iter, test_iter diff --git a/python/oneflow/test/dataloader/test_cifar_dataset.py 
b/python/oneflow/test/dataloader/test_cifar_dataset.py index dd057a587a440d41cfb20fbc2d05bb717596600f..f92a1f4f95b259b33b0d69ac14bdec8412c23802 100644 --- a/python/oneflow/test/dataloader/test_cifar_dataset.py +++ b/python/oneflow/test/dataloader/test_cifar_dataset.py @@ -20,6 +20,7 @@ import oneflow.unittest import oneflow as flow import oneflow.nn as nn import oneflow.optim as optim +from data_utils import load_data_cifar10 classes = ( @@ -81,21 +82,19 @@ def test(test_case): os.getenv("ONEFLOW_TEST_CACHE_DIR", "./data-test"), "cifar10" ) - trainset = flow.utils.vision.datasets.CIFAR10( - root=data_dir, - train=True, + train_iter, test_iter = load_data_cifar10( + batch_size=batch_size, + data_dir=data_dir, download=True, transform=transform, source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/cifar/cifar-10-python.tar.gz", - ) - trainloader = flow.utils.data.DataLoader( - trainset, batch_size=batch_size, shuffle=False, num_workers=0 + num_workers=0, ) final_loss = 0 for epoch in range(1, train_epoch + 1): # loop over the dataset multiple times running_loss = 0.0 - for i, data in enumerate(trainloader, 1): + for i, data in enumerate(train_iter, 1): # get the inputs; data is a list of [inputs, labels] inputs, labels = data inputs = inputs.to(dtype=flow.float32, device=device) @@ -130,10 +129,3 @@ class TestCifarDataset(flow.unittest.TestCase): if __name__ == "__main__": unittest.main() - # 1 epoch training log - # epoch: 1 step: 2000 loss: 2.107 - # epoch: 1 step: 4000 loss: 1.838 - # epoch: 1 step: 6000 loss: 1.644 - # epoch: 1 step: 8000 loss: 1.535 - # epoch: 1 step: 10000 loss: 1.528 - # epoch: 1 step: 12000 loss: 1.476 diff --git a/python/oneflow/test/dataloader/test_fashion_mnist_dataset.py b/python/oneflow/test/dataloader/test_fashion_mnist_dataset.py index 83dbc194b48a7d37dcb68f25b17004ad50029884..257391aa02e32ce41060a2e32acb451cf30b6401 100644 --- a/python/oneflow/test/dataloader/test_fashion_mnist_dataset.py +++ b/python/oneflow/test/dataloader/test_fashion_mnist_dataset.py @@ -20,42 +20,7 @@ import time import oneflow.unittest import oneflow as flow import oneflow.nn as nn - - -# reference: http://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.10_mlp-pytorch -def load_data_fashion_mnist( - batch_size, resize=None, root="./data/fashion-mnist", download=True, source_url=None -): - """Download the Fashion-MNIST dataset and then load into memory.""" - root = os.path.expanduser(root) - transformer = [] - if resize: - transformer += [flow.utils.vision.transforms.Resize(resize)] - transformer += [flow.utils.vision.transforms.ToTensor()] - transformer = flow.utils.vision.transforms.Compose(transformer) - - mnist_train = flow.utils.vision.datasets.FashionMNIST( - root=root, - train=True, - transform=transformer, - download=download, - source_url=source_url, - ) - mnist_test = flow.utils.vision.datasets.FashionMNIST( - root=root, - train=False, - transform=transformer, - download=download, - source_url=source_url, - ) - num_workers = 0 - train_iter = flow.utils.data.DataLoader( - mnist_train, batch_size, shuffle=True, num_workers=num_workers - ) - test_iter = flow.utils.data.DataLoader( - mnist_test, batch_size, shuffle=False, num_workers=num_workers - ) - return train_iter, test_iter +from data_utils import load_data_fashion_mnist def get_fashion_mnist_labels(labels): @@ -124,7 +89,7 @@ def test(test_case): ) source_url = "https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/Fashion-MNIST/" train_iter, test_iter = load_data_fashion_mnist( - 
batch_size, root=data_dir, download=True, source_url=source_url + batch_size, resize=None, root=data_dir, download=True, source_url=source_url ) loss = nn.CrossEntropyLoss() loss.to(device) @@ -174,6 +139,3 @@ class TestFashionMnistDataset(flow.unittest.TestCase): if __name__ == "__main__": unittest.main() - # 1 epoch training log - # epoch 1, loss 0.0034, train acc 0.718, test acc 0.771, cost >>>>>>> 158.32699990272522(s) - # epoch 2, loss 0.0022, train acc 0.807, test acc 0.726, cost >>>>>>> 159.64465260505676(s) diff --git a/python/oneflow/test/dataloader/test_lenet.py b/python/oneflow/test/dataloader/test_lenet.py index 51a929a518feafdf9435fe40dc95eb437441ca8f..1e831aab413ac2543e11f3d3e514a3cde4d214c5 100644 --- a/python/oneflow/test/dataloader/test_lenet.py +++ b/python/oneflow/test/dataloader/test_lenet.py @@ -20,6 +20,7 @@ import unittest import oneflow as flow import oneflow.nn as nn import oneflow.unittest +from data_utils import load_data_fashion_mnist # reference: http://tangshusen.me/Dive-into-DL-PyTorch/#/chapter05_CNN/5.5_lenet @@ -49,46 +50,6 @@ class LeNet(nn.Module): return output -def load_data_fashion_mnist( - batch_size, - resize=None, - root="./data-test/fashion-mnist", - download=True, - source_url=None, - num_workers=0, -): - """Download the Fashion-MNIST dataset and then load into memory.""" - root = os.path.expanduser(root) - trans = [] - if resize: - trans.append(flow.utils.vision.transforms.Resize(resize)) - trans.append(flow.utils.vision.transforms.ToTensor()) - transform = flow.utils.vision.transforms.Compose(trans) - - mnist_train = flow.utils.vision.datasets.FashionMNIST( - root=root, - train=True, - transform=transform, - download=download, - source_url=source_url, - ) - mnist_test = flow.utils.vision.datasets.FashionMNIST( - root=root, - train=False, - transform=transform, - download=download, - source_url=source_url, - ) - - train_iter = flow.utils.data.DataLoader( - mnist_train, batch_size, shuffle=True, num_workers=num_workers - ) - test_iter = flow.utils.data.DataLoader( - mnist_test, batch_size, shuffle=False, num_workers=num_workers - ) - return train_iter, test_iter - - def evaluate_accuracy(data_iter, net, device=None): if device is None and isinstance(net, nn.Module): device = list(net.parameters())[0].device @@ -176,8 +137,3 @@ class TestLenet(flow.unittest.TestCase): if __name__ == "__main__": unittest.main() - # 1 epoch training log - # epoch 1, loss 1.1473, train acc 0.569, test acc 0.742, time 162.4 sec - # epoch 2, loss 0.5736, train acc 0.784, test acc 0.796, time 158.1 sec - # epoch 3, loss 0.4761, train acc 0.826, test acc 0.821, time 154.0 sec - # epoch 4, loss 0.4215, train acc 0.848, test acc 0.855, time 160.3 sec diff --git a/python/oneflow/test/dataloader/test_mnist_dataset.py b/python/oneflow/test/dataloader/test_mnist_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..30cb54713df736185ab4cc4e3b3d60558e3dcc0c --- /dev/null +++ b/python/oneflow/test/dataloader/test_mnist_dataset.py @@ -0,0 +1,119 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" +import os +import unittest + +import oneflow.unittest +import oneflow as flow +import oneflow.nn as nn +import oneflow.utils.vision.transforms as transforms +from data_utils import load_data_mnist + + +data_dir = os.path.join( + os.getenv("ONEFLOW_TEST_CACHE_DIR", "./data-test"), "mnist-dataset" +) +train_iter, test_iter = load_data_mnist( + batch_size=128, + download=True, + root=data_dir, + source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist/MNIST/", +) + + +def evaluate_accuracy(data_iter, net, device=None): + n_correct, n_samples = 0.0, 0 + net.to(device) + net.eval() + with flow.no_grad(): + for images, labels in data_iter: + images = images.reshape(-1, 28 * 28) + images = images.to(device=device) + labels = labels.to(device=device) + n_correct += (net(images).argmax(dim=1).numpy() == labels.numpy()).sum() + n_samples += images.shape[0] + net.train() + return n_correct / n_samples + + +class Net(nn.Module): + def __init__( + self, input_size=784, hidden_size1=128, hidden_size2=64, num_classes=10 + ): + super(Net, self).__init__() + self.l1 = nn.Linear(input_size, hidden_size1) + self.relu1 = nn.ReLU() + self.l2 = nn.Linear(hidden_size1, hidden_size2) + self.relu2 = nn.ReLU() + self.l3 = nn.Linear(hidden_size2, num_classes) + + def forward(self, x): + out = self.l1(x) + out = self.relu1(out) + out = self.l2(out) + out = self.relu2(out) + out = self.l3(out) + return out + + +def test_train_and_eval(test_case): + if os.getenv("ONEFLOW_TEST_CPU_ONLY"): + device = flow.device("cpu") + else: + device = flow.device("cuda") + + model = Net() + model.to(device) + + loss = nn.CrossEntropyLoss().to(device) + optimizer = flow.optim.SGD(model.parameters(), lr=0.10) + + num_epochs = 1 + for epoch in range(num_epochs): + train_loss, n_correct, n_samples = 0.0, 0.0, 0 + for images, labels in train_iter: + images = images.reshape(-1, 28 * 28) + images = images.to(device=device) + labels = labels.to(device=device) + features = model(images) + l = loss(features, labels).sum() + optimizer.zero_grad() + l.backward() + optimizer.step() + + train_loss += l.numpy() + n_correct += (features.argmax(dim=1).numpy() == labels.numpy()).sum() + n_samples += images.shape[0] + if n_samples > 2000: + break + + test_acc = evaluate_accuracy(test_iter, model, device) + train_acc = n_correct / n_samples + print( + "epoch %d, train loss %.4f, train acc %.3f, test acc %.3f" + % (epoch + 1, train_loss / n_samples, train_acc, test_acc) + ) + # test_case.assertLess(0.8, test_acc) + + +@flow.unittest.skip_unless_1n1d() +class TestMnistDataset(flow.unittest.TestCase): + def test_mnist_dataset(test_case): + test_train_and_eval(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/dataloader/test_transforms.py b/python/oneflow/test/dataloader/test_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..5d0737b00f5b414dcf2c318a79dd1575060bec94 --- /dev/null +++ b/python/oneflow/test/dataloader/test_transforms.py @@ -0,0 +1,123 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import os +import unittest + +import oneflow as flow +import oneflow.nn as nn +import oneflow.optim as optim +import oneflow.utils.vision.transforms as transforms +import oneflow.unittest +from data_utils import load_data_cifar10 + + +class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(flow.F.relu(self.conv1(x))) + x = self.pool(flow.F.relu(self.conv2(x))) + x = flow.flatten(x, 1) # flatten all dimensions except batch + x = flow.F.relu(self.fc1(x)) + x = flow.F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def test(test_case): + if os.getenv("ONEFLOW_TEST_CPU_ONLY"): + device = flow.device("cpu") + else: + device = flow.device("cuda") + net = Net() + net.to(device) + + optimizer = optim.SGD(net.parameters(), lr=0.002, momentum=0.9) + criterion = nn.CrossEntropyLoss() + criterion.to(device) + + transform = flow.utils.vision.transforms.Compose( + [ + transforms.Pad(10), + transforms.RandomHorizontalFlip(p=0.5), + transforms.RandomVerticalFlip(p=0.5), + transforms.CenterCrop(32), + transforms.Resize([32, 32]), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) + + train_epoch = 1 + batch_size = 4 + data_dir = os.path.join( + os.getenv("ONEFLOW_TEST_CACHE_DIR", "./data-test"), "cifar10" + ) + + train_iter, test_iter = load_data_cifar10( + batch_size=batch_size, + data_dir=data_dir, + download=True, + transform=transform, + source_url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/cifar/cifar-10-python.tar.gz", + num_workers=0, + ) + + final_loss = 0 + for epoch in range(1, train_epoch + 1): # loop over the dataset multiple times + running_loss = 0.0 + for i, data in enumerate(train_iter, 1): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data + inputs = inputs.to(dtype=flow.float32, device=device) + labels = labels.to(dtype=flow.int64, device=device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.numpy() + # print every 2000 mini-batches + if i % 2000 == 0: + final_loss = running_loss / 2000 + print("epoch: %d step: %5d loss: %.3f " % (epoch, i, final_loss)) + running_loss = 0.0 + + print("final loss : ", final_loss) + # test_case.assertLess(final_loss, 1.79) + + +@flow.unittest.skip_unless_1n1d() +class TestCifarDataset(flow.unittest.TestCase): + def test_cifar_dataset(test_case): + test(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/utils/data/__init__.py b/python/oneflow/utils/data/__init__.py index f8ff04ba35c444f98f3eeaf0dd31fcdec9df3e9d..fb8219c4846dad972d0fa707e4f14b135ddbc670 100644 --- a/python/oneflow/utils/data/__init__.py +++ b/python/oneflow/utils/data/__init__.py @@ -35,6 +35,7 @@ from 
oneflow.utils.data.decorator import ( guaranteed_datapipes_determinism, non_deterministic, ) +from oneflow.utils.data.distributed import DistributedSampler __all__ = [ @@ -55,4 +56,5 @@ __all__ = [ "functional_datapipe", "guaranteed_datapipes_determinism", "non_deterministic", + "DistributedSampler", ] diff --git a/python/oneflow/utils/data/dataloader.py b/python/oneflow/utils/data/dataloader.py index 6d2051c6e47c62c3d4c18758f004196509a05f7e..003353b4727054eec89d21f79427a721d5702e3d 100644 --- a/python/oneflow/utils/data/dataloader.py +++ b/python/oneflow/utils/data/dataloader.py @@ -163,9 +163,7 @@ class DataLoader(Generic[T_co]): .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn` - cannot be an unpicklable object, e.g., a lambda function. See - :ref:`multiprocessing-best-practices` on more details related - to multiprocessing in OneFlow. + cannot be an unpicklable object, e.g., a lambda function. .. warning:: ``len(dataloader)`` heuristic is based on the length of the sampler used. When :attr:`dataset` is an :class:`~flow.utils.data.IterableDataset`, @@ -181,12 +179,6 @@ class DataLoader(Generic[T_co]): dropped when :attr:`drop_last` is set. Unfortunately, OneFlow can not detect such cases in general. - See `Dataset Types`_ for more details on these two types of datasets and how - :class:`~flow.utils.data.IterableDataset` interacts with - `Multi-process data loading`_. - - .. warning:: See :ref:`reproducibility`, and :ref:`dataloader-workers-random-seed`, and - :ref:`data-loading-randomness` notes for random seed related questions. """ dataset: Dataset[T_co] batch_size: Optional[int] diff --git a/python/oneflow/utils/data/dataset.py b/python/oneflow/utils/data/dataset.py index 3573fdf8a81a47cd05c777541ef2eeb0ff34f9b3..db792720aa26c806a483ea4f45b1b1d3f67546dd 100644 --- a/python/oneflow/utils/data/dataset.py +++ b/python/oneflow/utils/data/dataset.py @@ -195,7 +195,6 @@ class TensorDataset(Dataset[Tuple[Tensor, ...]]): Args: *tensors (Tensor): tensors that have the same size of the first dimension. """ - tensors: Tuple[Tensor, ...] def __init__(self, *tensors: Tensor) -> None: assert all( diff --git a/python/oneflow/utils/data/distributed.py b/python/oneflow/utils/data/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..832c928f1d86c24c3515f7a2c9128e20db22af11 --- /dev/null +++ b/python/oneflow/utils/data/distributed.py @@ -0,0 +1,169 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import math +import numpy as np +from typing import TypeVar, Optional, Iterator + +import oneflow as flow +import oneflow.distributed as dist +from oneflow.utils.data import Sampler, Dataset + + +T_co = TypeVar("T_co", covariant=True) + + +class DistributedSampler(Sampler[T_co]): + r"""Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`flow.nn.parallel.DistributedDataParallel`. 
In such a case, each + process can pass a :class:`~flow.utils.data.DistributedSampler` instance as a + :class:`~flow.utils.data.DataLoader` sampler, and load a subset of the + original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Args: + dataset: Dataset used for sampling. + num_replicas (int, optional): Number of processes participating in + distributed training. By default, :attr:`world_size` is retrieved from the + current distributed group. + rank (int, optional): Rank of the current process within :attr:`num_replicas`. + By default, :attr:`rank` is retrieved from the current distributed + group. + shuffle (bool, optional): If ``True`` (default), sampler will shuffle the + indices. + seed (int, optional): random seed used to shuffle the sampler if + :attr:`shuffle=True`. This number should be identical across all + processes in the distributed group. Default: ``0``. + drop_last (bool, optional): if ``True``, then the sampler will drop the + tail of the data to make it evenly divisible across the number of + replicas. If ``False``, the sampler will add extra indices to make + the data evenly divisible across the replicas. Default: ``False``. + + .. warning:: + In distributed mode, calling the :meth:`set_epoch` method at + the beginning of each epoch **before** creating the :class:`DataLoader` iterator + is necessary to make shuffling work properly across multiple epochs. Otherwise, + the same ordering will always be used. + + For example: + + .. code-block:: python + + >>> sampler = DistributedSampler(dataset) if is_distributed else None + >>> loader = DataLoader(dataset, shuffle=(sampler is None), sampler=sampler) + >>> for epoch in range(start_epoch, n_epochs): + ... if is_distributed: + ... sampler.set_epoch(epoch) + ... train(loader) + """ + + def __init__( + self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False, + ) -> None: + if not dist.is_multi_client(): + raise RuntimeError("Requires multi-client env to be available") + + if num_replicas is None: + num_replicas = dist.get_world_size() + if rank is None: + rank = dist.get_rank() + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1) + ) + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if self.drop_last and len(self.dataset) % self.num_replicas != 0: + # Split to nearest available length that is evenly divisible. + # This is to ensure each rank receives the same amount of data when + # using this Sampler.
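+            # Illustrative example (hypothetical sizes, not from the original change):
+            # with len(dataset) = 10 and num_replicas = 4, drop_last=True keeps
+            # ceil((10 - 4) / 4) = 2 samples per rank (8 indices used, 2 dropped),
+            # whereas drop_last=False pads the index list in __iter__ to
+            # ceil(10 / 4) = 3 samples per rank (12 indices, 2 of them repeated).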
+ self.num_samples = math.ceil( + # `type:ignore` is required because Dataset cannot provide a default __len__ + (len(self.dataset) - self.num_replicas) + / self.num_replicas + ) + else: + self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + self.seed = seed + + def __iter__(self) -> Iterator[T_co]: + if self.shuffle: + # deterministically shuffle based on epoch and seed + # TODO: replace with flow.randperm + g = flow.Generator() + g.manual_seed(self.seed + self.epoch) + # indices = flow.randperm(len(self.dataset), generator=g).tolist() + # NOTE: seed numpy with (seed + epoch) as well, so that every rank draws the same permutation + rng = np.random.default_rng(self.seed + self.epoch) + indices = rng.permutation(len(self.dataset)).tolist() + + else: + indices = list(range(len(self.dataset))) + + if not self.drop_last: + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[ + :padding_size + ] + else: + # remove tail of data to make it evenly divisible. + indices = indices[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self) -> int: + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + When :attr:`shuffle=True`, this ensures all replicas use a different random + ordering for each epoch. Otherwise, the next iteration of this sampler + will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch diff --git a/python/oneflow/utils/vision/__init__.py b/python/oneflow/utils/vision/__init__.py index cae7c6a8dc7703b09d568b427e7e61b23065107e..dec08e4b05b41dd81d160d7e41ed3c728b977af9 100644 --- a/python/oneflow/utils/vision/__init__.py +++ b/python/oneflow/utils/vision/__init__.py @@ -15,3 +15,28 @@ limitations under the License. """ from oneflow.utils.vision import datasets from oneflow.utils.vision import transforms + +_image_backend = "PIL" + + +def set_image_backend(backend): + """ + Specifies the package used to load images. + Args: + backend (string): Name of the image backend. One of {'PIL', 'accimage'}. + The :mod:`accimage` package uses the Intel IPP library. It is + generally faster than PIL, but does not support as many operations. + """ + global _image_backend + if backend not in ["PIL", "accimage"]: + raise ValueError( + "Invalid backend '{}'. Options are 'PIL' and 'accimage'".format(backend) + ) + _image_backend = backend + + +def get_image_backend(): + """ + Gets the name of the package used to load images. + """ + return _image_backend diff --git a/python/oneflow/utils/vision/datasets/__init__.py b/python/oneflow/utils/vision/datasets/__init__.py index a7227ea8bad6296d7bcb8e4cd04ad5a7bf7d272a..abfacb0ee26b4cd338d8631e527e842de8ce0e4b 100644 --- a/python/oneflow/utils/vision/datasets/__init__.py +++ b/python/oneflow/utils/vision/datasets/__init__.py @@ -15,5 +15,21 @@ limitations under the License.
""" from .mnist import MNIST, FashionMNIST from .cifar import CIFAR10, CIFAR100 +from .coco import CocoCaptions, CocoDetection +from .imagenet import ImageNet +from .voc import VOCDetection, VOCSegmentation +from .folder import DatasetFolder, ImageFolder -__all__ = ["MNIST", "FashionMNIST", "CIFAR10", "CIFAR100"] +__all__ = [ + "MNIST", + "FashionMNIST", + "CIFAR10", + "CIFAR100", + "CocoCaptions", + "CocoDetection", + "ImageNet", + "VOCDetection", + "VOCSegmentation", + "DatasetFolder", + "ImageFolder", +] diff --git a/python/oneflow/utils/vision/datasets/cifar.py b/python/oneflow/utils/vision/datasets/cifar.py index a9aa715c2df0f19ad3ac5cf1dca441173389aad9..5f1670693a19154c0b3c14d7bccf4ddeaf54c7bb 100644 --- a/python/oneflow/utils/vision/datasets/cifar.py +++ b/python/oneflow/utils/vision/datasets/cifar.py @@ -25,8 +25,10 @@ from .utils import check_integrity, download_and_extract_archive class CIFAR10(VisionDataset): - """`CIFAR10 `_ Dataset. + r""" `CIFAR10 `_ Dataset. + Args: + root (string): Root directory of dataset where directory ``cifar-10-batches-py`` exists or will be saved to if download is set to True. train (bool, optional): If True, creates dataset from training set, otherwise @@ -39,7 +41,6 @@ class CIFAR10(VisionDataset): puts it in root directory. If dataset is already downloaded, it is not downloaded again. """ - base_folder = "cifar-10-batches-py" url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" filename = "cifar-10-python.tar.gz" @@ -128,6 +129,7 @@ class CIFAR10(VisionDataset): """ Args: index (int): Index + Returns: tuple: (image, target) where target is index of the target class. """ @@ -170,10 +172,10 @@ class CIFAR10(VisionDataset): class CIFAR100(CIFAR10): - """`CIFAR100 `_ Dataset. + r""" `CIFAR100 `_ Dataset. + This is a subclass of the `CIFAR10` Dataset. """ - base_folder = "cifar-100-python" url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" filename = "cifar-100-python.tar.gz" diff --git a/python/oneflow/utils/vision/datasets/coco.py b/python/oneflow/utils/vision/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c468d2924ae918ef88b56131a72c6332b2df0394 --- /dev/null +++ b/python/oneflow/utils/vision/datasets/coco.py @@ -0,0 +1,112 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +from PIL import Image +import os +import os.path +from typing import Any, Callable, Optional, Tuple, List + +from .vision import VisionDataset + + +class CocoDetection(VisionDataset): + r"""`MS Coco Detection `_ Dataset. + + Args: + root (string): Root directory where images are downloaded to. + annFile (string): Path to json annotation file. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.ToTensor`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. 
+ transforms (callable, optional): A function/transform that takes input sample and its target as entry + and returns a transformed version. + """ + + def __init__( + self, + root: str, + annFile: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + transforms: Optional[Callable] = None, + ): + super().__init__(root, transforms, transform, target_transform) + from pycocotools.coco import COCO + + self.coco = COCO(annFile) + self.ids = list(sorted(self.coco.imgs.keys())) + + def _load_image(self, id: int) -> Image.Image: + path = self.coco.loadImgs(id)[0]["file_name"] + return Image.open(os.path.join(self.root, path)).convert("RGB") + + def _load_target(self, id) -> List[Any]: + return self.coco.loadAnns(self.coco.getAnnIds(id)) + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + id = self.ids[index] + image = self._load_image(id) + target = self._load_target(id) + + if self.transforms is not None: + image, target = self.transforms(image, target) + + return image, target + + def __len__(self) -> int: + return len(self.ids) + + +class CocoCaptions(CocoDetection): + r"""`MS Coco Captions `_ Dataset. + + Args: + root (string): Root directory where images are downloaded to. + annFile (string): Path to json annotation file. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.ToTensor`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + transforms (callable, optional): A function/transform that takes input sample and its target as entry + and returns a transformed version. + + Example: + + .. code:: python + + import oneflow.utils.vision.datasets as dset + import oneflow.utils.vision.transforms as transforms + cap = dset.CocoCaptions(root = 'dir where images are', + annFile = 'json annotation file', + transform=transforms.ToTensor()) + print('Number of samples: ', len(cap)) + img, target = cap[3] # load 4th sample + print("Image Size: ", img.size()) + print(target) + + Output: :: + + Number of samples: 82783 + Image Size: (3L, 427L, 640L) + [u'A plane emitting smoke stream flying over a mountain.', + u'A plane darts across a bright blue sky behind a mountain covered in snow', + u'A plane leaves a contrail above the snowy mountain top.', + u'A mountain that has a plane flying overheard in the distance.', + u'A mountain view with a plume of smoke in the background'] + """ + + def _load_target(self, id) -> List[str]: + return [ann["caption"] for ann in super()._load_target(id)] diff --git a/python/oneflow/utils/vision/datasets/folder.py b/python/oneflow/utils/vision/datasets/folder.py new file mode 100644 index 0000000000000000000000000000000000000000..d23722b8428ec2054d426d69e708ae497b04d788 --- /dev/null +++ b/python/oneflow/utils/vision/datasets/folder.py @@ -0,0 +1,346 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import os +import os.path + +from PIL import Image +from typing import Any, Callable, cast, Dict, List, Optional, Tuple + +from .vision import VisionDataset + + +def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool: + """Checks if a file is an allowed extension. + Args: + filename (string): path to a file + extensions (tuple of strings): extensions to consider (lowercase) + Returns: + bool: True if the filename ends with one of given extensions + """ + return filename.lower().endswith(extensions) + + +def is_image_file(filename: str) -> bool: + """Checks if a file is an allowed image extension. + Args: + filename (string): path to a file + Returns: + bool: True if the filename ends with a known image extension + """ + return has_file_allowed_extension(filename, IMG_EXTENSIONS) + + +def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: + """Finds the class folders in a dataset. + + See :class:`DatasetFolder` for details. + """ + classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir()) + if not classes: + raise FileNotFoundError(f"Couldn't find any class folder in {directory}.") + + class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} + return classes, class_to_idx + + +def make_dataset( + directory: str, + class_to_idx: Optional[Dict[str, int]] = None, + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, +) -> List[Tuple[str, int]]: + """Generates a list of samples of a form (path_to_sample, class). + + See :class:`DatasetFolder` for details. + Note: The class_to_idx parameter is here optional and will use the logic of the ``find_classes`` function + by default. + """ + directory = os.path.expanduser(directory) + + if class_to_idx is None: + _, class_to_idx = find_classes(directory) + elif not class_to_idx: + raise ValueError( + "'class_to_index' must have at least one entry to collect any samples." + ) + + both_none = extensions is None and is_valid_file is None + both_something = extensions is not None and is_valid_file is not None + if both_none or both_something: + raise ValueError( + "Both extensions and is_valid_file cannot be None or not None at the same time" + ) + + if extensions is not None: + + def is_valid_file(x: str) -> bool: + return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions)) + + is_valid_file = cast(Callable[[str], bool], is_valid_file) + + instances = [] + available_classes = set() + for target_class in sorted(class_to_idx.keys()): + class_index = class_to_idx[target_class] + target_dir = os.path.join(directory, target_class) + if not os.path.isdir(target_dir): + continue + for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)): + for fname in sorted(fnames): + if is_valid_file(fname): + path = os.path.join(root, fname) + item = path, class_index + instances.append(item) + + if target_class not in available_classes: + available_classes.add(target_class) + + empty_classes = set(class_to_idx.keys()) - available_classes + if empty_classes: + msg = ( + f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " + ) + if extensions is not None: + msg += f"Supported extensions are: {', '.join(extensions)}" + raise FileNotFoundError(msg) + + return instances + + +class DatasetFolder(VisionDataset): + r"""A generic data loader. + This default directory structure can be customized by overriding the + :meth:`find_classes` method. + + Args: + root (string): Root directory path. 
+ loader (callable): A function to load a sample given its path. + extensions (tuple[string]): A list of allowed extensions. + both extensions and is_valid_file should not be passed. + transform (callable, optional): A function/transform that takes in + a sample and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + target_transform (callable, optional): A function/transform that takes + in the target and transforms it. + is_valid_file (callable, optional): A function that takes path of a file + and check if the file is a valid file (used to check of corrupt files) + both extensions and is_valid_file should not be passed. + + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). + samples (list): List of (sample path, class_index) tuples + targets (list): The class_index value for each image in the dataset + """ + + def __init__( + self, + root: str, + loader: Callable[[str], Any], + extensions: Optional[Tuple[str, ...]] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> None: + super(DatasetFolder, self).__init__( + root, transform=transform, target_transform=target_transform + ) + classes, class_to_idx = self.find_classes(self.root) + samples = self.make_dataset(self.root, class_to_idx, extensions, is_valid_file) + + self.loader = loader + self.extensions = extensions + + self.classes = classes + self.class_to_idx = class_to_idx + self.samples = samples + self.targets = [s[1] for s in samples] + + @staticmethod + def make_dataset( + directory: str, + class_to_idx: Dict[str, int], + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> List[Tuple[str, int]]: + """Generates a list of samples of a form (path_to_sample, class). + This can be overridden to e.g. read files from a compressed zip file instead of from the disk. + + Args: + directory (str): root dataset directory, corresponding to ``self.root``. + class_to_idx (Dict[str, int]): Dictionary mapping class name to class index. + extensions (optional): A list of allowed extensions. + Either extensions or is_valid_file should be passed. Defaults to None. + is_valid_file (optional): A function that takes path of a file + and checks if the file is a valid file + (used to check of corrupt files) both extensions and + is_valid_file should not be passed. Defaults to None. + + Raises: + ValueError: In case ``class_to_idx`` is empty. + ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. + FileNotFoundError: In case no valid file was found for any class. + + Returns: + List[Tuple[str, int]]: samples of a form (path_to_sample, class) + """ + if class_to_idx is None: + # prevent potential bug since make_dataset() would use the class_to_idx logic of the + # find_classes() function, instead of using that of the find_classes() method, which + # is potentially overridden and thus could have a different logic. + raise ValueError("The class_to_idx parameter cannot be None.") + return make_dataset( + directory, class_to_idx, extensions=extensions, is_valid_file=is_valid_file + ) + + def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]: + """Find the class folders in a dataset structured as follows: + + .. code-block:: shell + + directory/ + ├── class_x + │ ├── xxx.ext + │ ├── xxy.ext + │ └── ... 
+ │ └── xxz.ext + └── class_y + ├── 123.ext + ├── nsdf3.ext + └── ... + └── asd932_.ext + + This method can be overridden to only consider + a subset of classes, or to adapt to a different dataset directory structure. + + Args: + directory(str): Root directory path, corresponding to ``self.root`` + + Raises: + FileNotFoundError: If ``dir`` has no class folders. + + Returns: + (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index. + """ + return find_classes(directory) + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + Returns: + tuple: (sample, target) where target is class_index of the target class. + """ + path, target = self.samples[index] + sample = self.loader(path) + if self.transform is not None: + sample = self.transform(sample) + if self.target_transform is not None: + target = self.target_transform(target) + + return sample, target + + def __len__(self) -> int: + return len(self.samples) + + +IMG_EXTENSIONS = ( + ".jpg", + ".jpeg", + ".png", + ".ppm", + ".bmp", + ".pgm", + ".tif", + ".tiff", + ".webp", +) + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, "rb") as f: + img = Image.open(f) + return img.convert("RGB") + + +# TODO: specify the return type +def accimage_loader(path: str) -> Any: + import accimage + + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def default_loader(path: str) -> Any: + from oneflow.utils.vision import get_image_backend + + if get_image_backend() == "accimage": + return accimage_loader(path) + else: + return pil_loader(path) + + +class ImageFolder(DatasetFolder): + r"""A generic data loader where the images are arranged in this way by default: + + .. code-block:: shell + + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + + This class inherits from :class:`~vision.datasets.DatasetFolder` so + the same methods can be overridden to customize the dataset. + + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + is_valid_file (callable, optional): A function that takes path of an Image file + and check if the file is a valid file (used to check of corrupt files) + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). 
+ imgs (list): List of (image path, class_index) tuples + """ + + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + is_valid_file: Optional[Callable[[str], bool]] = None, + ): + super(ImageFolder, self).__init__( + root, + loader, + IMG_EXTENSIONS if is_valid_file is None else None, + transform=transform, + target_transform=target_transform, + is_valid_file=is_valid_file, + ) + self.imgs = self.samples diff --git a/python/oneflow/utils/vision/datasets/imagenet.py b/python/oneflow/utils/vision/datasets/imagenet.py new file mode 100644 index 0000000000000000000000000000000000000000..a89b0925fb6680278d3f5383e20380570228fa6b --- /dev/null +++ b/python/oneflow/utils/vision/datasets/imagenet.py @@ -0,0 +1,259 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import warnings +from contextlib import contextmanager +import os +import shutil +import tempfile +from typing import Any, Dict, List, Iterator, Optional, Tuple + +import oneflow as flow +from .folder import ImageFolder +from .utils import check_integrity, extract_archive, verify_str_arg + +ARCHIVE_META = { + "train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"), + "val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"), + "devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"), +} + +META_FILE = "meta.bin" + + +class ImageNet(ImageFolder): + r""" `ImageNet `_ 2012 Classification Dataset. + + Args: + root (string): Root directory of the ImageNet Dataset. + split (string, optional): The dataset split, supports ``train``, or ``val``. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + + Attributes: + classes (list): List of the class name tuples. + class_to_idx (dict): Dict with items (class_name, class_index). + wnids (list): List of the WordNet IDs. + wnid_to_idx (dict): Dict with items (wordnet_id, class_index). + imgs (list): List of (image path, class_index) tuples + targets (list): The class_index value for each image in the dataset + """ + + def __init__( + self, + root: str, + split: str = "train", + download: Optional[str] = None, + **kwargs: Any + ) -> None: + if download is True: + msg = ( + "The dataset is no longer publicly accessible. You need to " + "download the archives externally and place them in the root " + "directory." + ) + raise RuntimeError(msg) + elif download is False: + msg = ( + "The use of the download flag is deprecated, since the dataset " + "is no longer publicly accessible." 
+ ) + warnings.warn(msg, RuntimeWarning) + + root = self.root = os.path.expanduser(root) + self.split = verify_str_arg(split, "split", ("train", "val")) + + self.parse_archives() + wnid_to_classes = load_meta_file(self.root)[0] + + super(ImageNet, self).__init__(self.split_folder, **kwargs) + self.root = root + + self.wnids = self.classes + self.wnid_to_idx = self.class_to_idx + self.classes = [wnid_to_classes[wnid] for wnid in self.wnids] + self.class_to_idx = { + cls: idx for idx, clss in enumerate(self.classes) for cls in clss + } + + def parse_archives(self) -> None: + if not check_integrity(os.path.join(self.root, META_FILE)): + parse_devkit_archive(self.root) + + if not os.path.isdir(self.split_folder): + if self.split == "train": + parse_train_archive(self.root) + elif self.split == "val": + parse_val_archive(self.root) + + @property + def split_folder(self) -> str: + return os.path.join(self.root, self.split) + + def extra_repr(self) -> str: + return "Split: {split}".format(**self.__dict__) + + +def load_meta_file( + root: str, file: Optional[str] = None +) -> Tuple[Dict[str, str], List[str]]: + if file is None: + file = META_FILE + file = os.path.join(root, file) + + if check_integrity(file): + return flow.load(file) + else: + msg = ( + "The meta file {} is not present in the root directory or is corrupted. " + "This file is automatically created by the ImageNet dataset." + ) + raise RuntimeError(msg.format(file, root)) + + +def _verify_archive(root: str, file: str, md5: str) -> None: + if not check_integrity(os.path.join(root, file), md5): + msg = ( + "The archive {} is not present in the root directory or is corrupted. " + "You need to download it externally and place it in {}." + ) + raise RuntimeError(msg.format(file, root)) + + +def parse_devkit_archive(root: str, file: Optional[str] = None) -> None: + """Parse the devkit archive of the ImageNet2012 classification dataset and save + the meta information in a binary file. + Args: + root (str): Root directory containing the devkit archive + file (str, optional): Name of devkit archive. 
Defaults to + 'ILSVRC2012_devkit_t12.tar.gz' + """ + import scipy.io as sio + + def parse_meta_mat(devkit_root: str) -> Tuple[Dict[int, str], Dict[str, str]]: + metafile = os.path.join(devkit_root, "data", "meta.mat") + meta = sio.loadmat(metafile, squeeze_me=True)["synsets"] + nums_children = list(zip(*meta))[4] + meta = [ + meta[idx] + for idx, num_children in enumerate(nums_children) + if num_children == 0 + ] + idcs, wnids, classes = list(zip(*meta))[:3] + classes = [tuple(clss.split(", ")) for clss in classes] + idx_to_wnid = {idx: wnid for idx, wnid in zip(idcs, wnids)} + wnid_to_classes = {wnid: clss for wnid, clss in zip(wnids, classes)} + return idx_to_wnid, wnid_to_classes + + def parse_val_groundtruth_txt(devkit_root: str) -> List[int]: + file = os.path.join( + devkit_root, "data", "ILSVRC2012_validation_ground_truth.txt" + ) + with open(file, "r") as txtfh: + val_idcs = txtfh.readlines() + return [int(val_idx) for val_idx in val_idcs] + + @contextmanager + def get_tmp_dir() -> Iterator[str]: + tmp_dir = tempfile.mkdtemp() + try: + yield tmp_dir + finally: + shutil.rmtree(tmp_dir) + + archive_meta = ARCHIVE_META["devkit"] + if file is None: + file = archive_meta[0] + md5 = archive_meta[1] + + _verify_archive(root, file, md5) + + with get_tmp_dir() as tmp_dir: + extract_archive(os.path.join(root, file), tmp_dir) + + devkit_root = os.path.join(tmp_dir, "ILSVRC2012_devkit_t12") + idx_to_wnid, wnid_to_classes = parse_meta_mat(devkit_root) + val_idcs = parse_val_groundtruth_txt(devkit_root) + val_wnids = [idx_to_wnid[idx] for idx in val_idcs] + + flow.save((wnid_to_classes, val_wnids), os.path.join(root, META_FILE)) + + +def parse_train_archive( + root: str, file: Optional[str] = None, folder: str = "train" +) -> None: + """Parse the train images archive of the ImageNet2012 classification dataset and + prepare it for usage with the ImageNet dataset. + Args: + root (str): Root directory containing the train images archive + file (str, optional): Name of train images archive. Defaults to + 'ILSVRC2012_img_train.tar' + folder (str, optional): Optional name for train images folder. Defaults to + 'train' + """ + archive_meta = ARCHIVE_META["train"] + if file is None: + file = archive_meta[0] + md5 = archive_meta[1] + + _verify_archive(root, file, md5) + + train_root = os.path.join(root, folder) + extract_archive(os.path.join(root, file), train_root) + + archives = [os.path.join(train_root, archive) for archive in os.listdir(train_root)] + for archive in archives: + extract_archive(archive, os.path.splitext(archive)[0], remove_finished=True) + + +def parse_val_archive( + root: str, + file: Optional[str] = None, + wnids: Optional[List[str]] = None, + folder: str = "val", +) -> None: + """Parse the validation images archive of the ImageNet2012 classification dataset + and prepare it for usage with the ImageNet dataset. + Args: + root (str): Root directory containing the validation images archive + file (str, optional): Name of validation images archive. Defaults to + 'ILSVRC2012_img_val.tar' + wnids (list, optional): List of WordNet IDs of the validation images. If None + is given, the IDs are loaded from the meta file in the root directory + folder (str, optional): Optional name for validation images folder. 
Defaults to + 'val' + """ + archive_meta = ARCHIVE_META["val"] + if file is None: + file = archive_meta[0] + md5 = archive_meta[1] + if wnids is None: + wnids = load_meta_file(root)[1] + + _verify_archive(root, file, md5) + + val_root = os.path.join(root, folder) + extract_archive(os.path.join(root, file), val_root) + + images = sorted([os.path.join(val_root, image) for image in os.listdir(val_root)]) + + for wnid in set(wnids): + os.mkdir(os.path.join(val_root, wnid)) + + for wnid, img_file in zip(wnids, images): + shutil.move(img_file, os.path.join(val_root, wnid, os.path.basename(img_file))) diff --git a/python/oneflow/utils/vision/datasets/mnist.py b/python/oneflow/utils/vision/datasets/mnist.py index 76f1eb1a9300a3d64d3a801da3e70b3419248146..9ac513c23e6ee58d847a2549f54ca42f29d5ac63 100644 --- a/python/oneflow/utils/vision/datasets/mnist.py +++ b/python/oneflow/utils/vision/datasets/mnist.py @@ -29,7 +29,8 @@ from oneflow.framework.tensor import Tensor class MNIST(VisionDataset): - """`MNIST `_ Dataset. + r""" `MNIST `_ Dataset. + Args: root (string): Root directory of dataset where ``MNIST/processed/training.pt`` and ``MNIST/processed/test.pt`` exist. @@ -43,7 +44,6 @@ class MNIST(VisionDataset): target_transform (callable, optional): A function/transform that takes in the target and transforms it. """ - mirrors = [ "http://yann.lecun.com/exdb/mnist/", "https://ossci-datasets.s3.amazonaws.com/mnist/", @@ -222,7 +222,8 @@ class MNIST(VisionDataset): class FashionMNIST(MNIST): - """`Fashion-MNIST `_ Dataset. + r""" `Fashion-MNIST `_ Dataset. + Args: root (string): Root directory of dataset where ``FashionMNIST/processed/training.pt`` and ``FashionMNIST/processed/test.pt`` exist. @@ -236,7 +237,6 @@ class FashionMNIST(MNIST): target_transform (callable, optional): A function/transform that takes in the target and transforms it. """ - mirrors = ["http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"] resources = [ diff --git a/python/oneflow/utils/vision/datasets/voc.py b/python/oneflow/utils/vision/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..ba7d11eb477d52a170cd8339205517e4ac4b2c9d --- /dev/null +++ b/python/oneflow/utils/vision/datasets/voc.py @@ -0,0 +1,256 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import os +import warnings +import collections + +from xml.etree.ElementTree import Element as ET_Element + +try: + from defusedxml.ElementTree import parse as ET_parse +except ImportError: + from xml.etree.ElementTree import parse as ET_parse +from PIL import Image +from typing import Any, Callable, Dict, Optional, Tuple, List + +from .utils import download_and_extract_archive, verify_str_arg +from .vision import VisionDataset + + +DATASET_YEAR_DICT = { + "2012": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar", + "filename": "VOCtrainval_11-May-2012.tar", + "md5": "6cd6e144f989b92b3379bac3b3de84fd", + "base_dir": os.path.join("VOCdevkit", "VOC2012"), + }, + "2011": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2011/VOCtrainval_25-May-2011.tar", + "filename": "VOCtrainval_25-May-2011.tar", + "md5": "6c3384ef61512963050cb5d687e5bf1e", + "base_dir": os.path.join("TrainVal", "VOCdevkit", "VOC2011"), + }, + "2010": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar", + "filename": "VOCtrainval_03-May-2010.tar", + "md5": "da459979d0c395079b5c75ee67908abb", + "base_dir": os.path.join("VOCdevkit", "VOC2010"), + }, + "2009": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2009/VOCtrainval_11-May-2009.tar", + "filename": "VOCtrainval_11-May-2009.tar", + "md5": "59065e4b188729180974ef6572f6a212", + "base_dir": os.path.join("VOCdevkit", "VOC2009"), + }, + "2008": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2008/VOCtrainval_14-Jul-2008.tar", + "filename": "VOCtrainval_11-May-2012.tar", + "md5": "2629fa636546599198acfcfbfcf1904a", + "base_dir": os.path.join("VOCdevkit", "VOC2008"), + }, + "2007": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar", + "filename": "VOCtrainval_06-Nov-2007.tar", + "md5": "c52e279531787c972589f7e41ab4ae64", + "base_dir": os.path.join("VOCdevkit", "VOC2007"), + }, + "2007-test": { + "url": "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar", + "filename": "VOCtest_06-Nov-2007.tar", + "md5": "b6e924de25625d8de591ea690078ad9f", + "base_dir": os.path.join("VOCdevkit", "VOC2007"), + }, +} + + +class _VOCBase(VisionDataset): + _SPLITS_DIR: str + _TARGET_DIR: str + _TARGET_FILE_EXT: str + + def __init__( + self, + root: str, + year: str = "2012", + image_set: str = "train", + download: bool = False, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + transforms: Optional[Callable] = None, + ): + super().__init__(root, transforms, transform, target_transform) + if year == "2007-test": + if image_set == "test": + warnings.warn( + "Acessing the test image set of the year 2007 with year='2007-test' is deprecated. " + "Please use the combination year='2007' and image_set='test' instead." + ) + year = "2007" + else: + raise ValueError( + "In the test image set of the year 2007 only image_set='test' is allowed. " + "For all other image sets use year='2007' instead." 
+ ) + self.year = year + + valid_image_sets = ["train", "trainval", "val"] + if year == "2007": + valid_image_sets.append("test") + self.image_set = verify_str_arg(image_set, "image_set", valid_image_sets) + + key = "2007-test" if year == "2007" and image_set == "test" else year + dataset_year_dict = DATASET_YEAR_DICT[key] + + self.url = dataset_year_dict["url"] + self.filename = dataset_year_dict["filename"] + self.md5 = dataset_year_dict["md5"] + + base_dir = dataset_year_dict["base_dir"] + voc_root = os.path.join(self.root, base_dir) + + if download: + download_and_extract_archive( + self.url, self.root, filename=self.filename, md5=self.md5 + ) + + if not os.path.isdir(voc_root): + raise RuntimeError( + "Dataset not found or corrupted. You can use download=True to download it" + ) + + splits_dir = os.path.join(voc_root, "ImageSets", self._SPLITS_DIR) + split_f = os.path.join(splits_dir, image_set.rstrip("\n") + ".txt") + with open(os.path.join(split_f), "r") as f: + file_names = [x.strip() for x in f.readlines()] + + image_dir = os.path.join(voc_root, "JPEGImages") + self.images = [os.path.join(image_dir, x + ".jpg") for x in file_names] + + target_dir = os.path.join(voc_root, self._TARGET_DIR) + self.targets = [ + os.path.join(target_dir, x + self._TARGET_FILE_EXT) for x in file_names + ] + + assert len(self.images) == len(self.targets) + + def __len__(self) -> int: + return len(self.images) + + +class VOCSegmentation(_VOCBase): + r""" `Pascal VOC `_ Segmentation Dataset. + + Args: + root (string): Root directory of the VOC Dataset. + year (string, optional): The dataset year, supports years ``"2007"`` to ``"2012"``. + image_set (string, optional): Select the image_set to use, ``"train"``, ``"trainval"`` or ``"val"``. If + ``year=="2007"``, can also be ``"test"``. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + transforms (callable, optional): A function/transform that takes input sample and its target as entry + and returns a transformed version. + """ + _SPLITS_DIR = "Segmentation" + _TARGET_DIR = "SegmentationClass" + _TARGET_FILE_EXT = ".png" + + @property + def masks(self) -> List[str]: + return self.targets + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is the image segmentation. + """ + img = Image.open(self.images[index]).convert("RGB") + target = Image.open(self.masks[index]) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + +class VOCDetection(_VOCBase): + r""" `Pascal VOC `_ Detection Dataset. + + Args: + root (string): Root directory of the VOC Dataset. + year (string, optional): The dataset year, supports years ``"2007"`` to ``"2012"``. + image_set (string, optional): Select the image_set to use, ``"train"``, ``"trainval"`` or ``"val"``. If + ``year=="2007"``, can also be ``"test"``. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + (default: alphabetic indexing of VOC's 20 classes). 
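(Editorial usage sketch, not part of the patch.) A minimal example for the VOCSegmentation class defined above; it assumes network access for download=True and that the VisionDataset base combines transform/target_transform in the usual torchvision way:

import oneflow as flow
import oneflow.utils.vision.transforms as transforms

# Downloads VOCtrainval_11-May-2012.tar into root on first use, then reads
# ImageSets/Segmentation/train.txt to build the (image, mask) file lists.
voc_train = flow.utils.vision.datasets.VOCSegmentation(
    root="./data/voc",
    year="2012",
    image_set="train",
    download=True,
    transform=transforms.ToTensor(),            # applied to the JPEG image
    target_transform=transforms.PILToTensor(),  # applied to the PNG mask
)
img, mask = voc_train[0]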
+ transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, required): A function/transform that takes in the + target and transforms it. + transforms (callable, optional): A function/transform that takes input sample and its target as entry + and returns a transformed version. + """ + _SPLITS_DIR = "Main" + _TARGET_DIR = "Annotations" + _TARGET_FILE_EXT = ".xml" + + @property + def annotations(self) -> List[str]: + return self.targets + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is a dictionary of the XML tree. + """ + img = Image.open(self.images[index]).convert("RGB") + target = self.parse_voc_xml(ET_parse(self.annotations[index]).getroot()) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def parse_voc_xml(self, node: ET_Element) -> Dict[str, Any]: + voc_dict: Dict[str, Any] = {} + children = list(node) + if children: + def_dic: Dict[str, Any] = collections.defaultdict(list) + for dc in map(self.parse_voc_xml, children): + for ind, v in dc.items(): + def_dic[ind].append(v) + if node.tag == "annotation": + def_dic["object"] = [def_dic["object"]] + voc_dict = { + node.tag: { + ind: v[0] if len(v) == 1 else v for ind, v in def_dic.items() + } + } + if node.text: + text = node.text.strip() + if not children: + voc_dict[node.tag] = text + return voc_dict diff --git a/python/oneflow/utils/vision/transforms/__init__.py b/python/oneflow/utils/vision/transforms/__init__.py index aa8fae46e6fe9a684c597cd410e480e90ff3fb3a..1424e5744d5200c44a313a0f6336230020a10449 100644 --- a/python/oneflow/utils/vision/transforms/__init__.py +++ b/python/oneflow/utils/vision/transforms/__init__.py @@ -13,6 +13,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ -from .transforms import Normalize, Compose, ToTensor, Resize +from .transforms import ( + Compose, + ToTensor, + PILToTensor, + ConvertImageDtype, + ToPILImage, + Normalize, + Resize, + Scale, + CenterCrop, + Pad, + Lambda, + RandomTransforms, + RandomApply, + RandomOrder, + RandomChoice, + RandomCrop, + RandomHorizontalFlip, + RandomVerticalFlip, + RandomResizedCrop, + RandomSizedCrop, + FiveCrop, + TenCrop, + InterpolationMode, +) -__all__ = ["Normalize", "Compose", "ToTensor", "Resize"] +__all__ = [ + "Compose", + "ToTensor", + "PILToTensor", + "ConvertImageDtype", + "ToPILImage", + "Normalize", + "Resize", + "Scale", + "CenterCrop", + "Pad", + "Lambda", + "RandomTransforms", + "RandomApply", + "RandomOrder", + "RandomChoice", + "RandomCrop", + "RandomHorizontalFlip", + "RandomVerticalFlip", + "RandomResizedCrop", + "RandomSizedCrop", + "FiveCrop", + "TenCrop", + "InterpolationMode", +] diff --git a/python/oneflow/utils/vision/transforms/functional.py b/python/oneflow/utils/vision/transforms/functional.py index d3092113dbd7ca8dd03100fbb6c311a0975849b7..b77e564f01912cedaa7b35edeafdfb0ff59fb34c 100644 --- a/python/oneflow/utils/vision/transforms/functional.py +++ b/python/oneflow/utils/vision/transforms/functional.py @@ -14,11 +14,12 @@ See the License for the specific language governing permissions and limitations under the License. 
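(Editorial sketch, not part of the patch.) VOCDetection above returns the annotation as a nested dict built by parse_voc_xml; leaf text stays as strings and the object entries are always wrapped in a list. Values below are illustrative only:

import oneflow as flow

voc_det = flow.utils.vision.datasets.VOCDetection(
    root="./data/voc", year="2007", image_set="trainval", download=True,
)
img, target = voc_det[0]
# e.g. (illustrative values):
# target["annotation"]["object"][0]["name"]            -> "dog"
# target["annotation"]["object"][0]["bndbox"]["xmin"]  -> "48"   (kept as a string)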
""" import warnings +import numbers from enum import Enum - +from typing import List, Any, Tuple, Optional import numpy as np from PIL import Image -from typing import List, Any +import math try: import accimage @@ -27,13 +28,12 @@ except ImportError: import oneflow as flow from oneflow.framework.tensor import Tensor - from . import functional_pil as F_pil from . import functional_tensor as F_t class InterpolationMode(Enum): - """Interpolation modes + r"""Interpolation modes """ NEAREST = "nearest" @@ -67,6 +67,24 @@ pil_modes_mapping = { } +def _get_image_size(img: Tensor) -> List[int]: + """Returns image size as [w, h] + """ + if isinstance(img, flow.Tensor): + return F_t._get_image_size(img) + + return F_pil._get_image_size(img) + + +def _get_image_num_channels(img: Tensor) -> int: + """Returns number of image channels + """ + if isinstance(img, flow.Tensor): + return F_t._get_image_num_channels(img) + + return F_pil._get_image_num_channels(img) + + def _is_pil_image(img: Any) -> bool: if accimage is not None: return isinstance(img, (Image.Image, accimage.Image)) @@ -140,6 +158,64 @@ def to_tensor(pic): return res +def pil_to_tensor(pic): + """Convert a ``PIL Image`` to a tensor of the same type. + + See :class:`~vision.transforms.PILToTensor` for more details. + + Args: + pic (PIL Image): Image to be converted to tensor. + + Returns: + Tensor: Converted image. + """ + if not F_pil._is_pil_image(pic): + raise TypeError("pic should be PIL Image. Got {}".format(type(pic))) + + if accimage is not None and isinstance(pic, accimage.Image): + # accimage format is always uint8 internally, so always return uint8 here + nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.uint8) + pic.copyto(nppic) + return flow.tensor(nppic) + + # handle PIL Image + img = flow.tensor(np.asarray(pic)) + img = img.view(pic.size[1], pic.size[0], len(pic.getbands())) + # put it from HWC to CHW format + img = img.permute((2, 0, 1)) + return img + + +def convert_image_dtype( + image: flow.Tensor, dtype: flow.dtype = flow.float +) -> flow.Tensor: + """Convert a tensor image to the given ``dtype`` and scale the values accordingly + This function does not support PIL Image. + + Args: + image (flow.Tensor): Image to be converted + dtype (flow.dtype): Desired data type of the output + + Returns: + Tensor: Converted image + + .. note:: + + When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly. + If converted back and forth, this mismatch has no effect. + + Raises: + RuntimeError: When trying to cast :class:`flow.float32` to :class:`flow.int32` or :class:`flow.int64` as + well as for trying to cast :class:`flow.float64` to :class:`flow.int64`. These conversions might lead to + overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range + of the integer ``dtype``. + """ + if not isinstance(image, flow.Tensor): + raise TypeError("Input img should be Tensor Image") + + return F_t.convert_image_dtype(image, dtype) + + def normalize( tensor: Tensor, mean: List[float], std: List[float], inplace: bool = False ) -> Tensor: @@ -156,9 +232,7 @@ def normalize( Returns: Tensor: Normalized Tensor image. """ - if not isinstance(tensor, flow.Tensor) and not isinstance( - tensor, flow._oneflow_internal.Tensor - ): + if not isinstance(tensor, flow.Tensor): raise TypeError( "Input tensor should be a oneflow tensor. 
Got {}.".format(type(tensor)) ) @@ -238,3 +312,423 @@ def resize( return F_pil.resize(img, size=size, interpolation=pil_interpolation) return F_t.resize(img, size=size, interpolation=interpolation.value) + + +def scale(*args, **kwargs): + warnings.warn( + "The use of the transforms.Scale transform is deprecated, " + + "please use transforms.Resize instead." + ) + return resize(*args, **kwargs) + + +def pad( + img: Tensor, padding: List[int], fill: int = 0, padding_mode: str = "constant" +) -> Tensor: + r"""Pad the given image on all sides with the given "pad" value. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, + at most 3 leading dimensions for mode edge, + and an arbitrary number of leading dimensions for mode constant + + Args: + img (PIL Image or Tensor): Image to be padded. + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + fill (number or str or tuple): Pixel fill value for constant fill. Default is 0. + If a tuple of length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for oneflow Tensor. + Only int or str or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D oneflow Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + PIL Image or Tensor: Padded image. + """ + if not isinstance(img, flow.Tensor): + return F_pil.pad(img, padding=padding, fill=fill, padding_mode=padding_mode) + + return F_t.pad(img, padding=padding, fill=fill, padding_mode=padding_mode) + + +def crop(img: Tensor, top: int, left: int, height: int, width: int) -> Tensor: + """Crop the given image at specified location and output size. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then cropped. + + Args: + img (PIL Image or Tensor): Image to be cropped. (0,0) denotes the top left corner of the image. + top (int): Vertical component of the top left corner of the crop box. + left (int): Horizontal component of the top left corner of the crop box. + height (int): Height of the crop box. + width (int): Width of the crop box. + + Returns: + PIL Image or Tensor: Cropped image. 
+ """ + + if not isinstance(img, flow.Tensor): + return F_pil.crop(img, top, left, height, width) + + return F_t.crop(img, top, left, height, width) + + +def center_crop(img: Tensor, output_size: List[int]) -> Tensor: + """Crops the given image at the center. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. + + Args: + img (PIL Image or Tensor): Image to be cropped. + output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int, + it is used for both directions. + + Returns: + PIL Image or Tensor: Cropped image. + """ + if isinstance(output_size, numbers.Number): + output_size = (int(output_size), int(output_size)) + elif isinstance(output_size, (tuple, list)) and len(output_size) == 1: + output_size = (output_size[0], output_size[0]) + + image_width, image_height = _get_image_size(img) + crop_height, crop_width = output_size + + if crop_width > image_width or crop_height > image_height: + padding_ltrb = [ + (crop_width - image_width) // 2 if crop_width > image_width else 0, + (crop_height - image_height) // 2 if crop_height > image_height else 0, + (crop_width - image_width + 1) // 2 if crop_width > image_width else 0, + (crop_height - image_height + 1) // 2 if crop_height > image_height else 0, + ] + img = pad(img, padding_ltrb, fill=0) # PIL uses fill value 0 + image_width, image_height = _get_image_size(img) + if crop_width == image_width and crop_height == image_height: + return img + + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return crop(img, crop_top, crop_left, crop_height, crop_width) + + +def resized_crop( + img: Tensor, + top: int, + left: int, + height: int, + width: int, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, +) -> Tensor: + """Crop the given image and resize it to desired size. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + Notably used in :class:`~vision.transforms.RandomResizedCrop`. + + Args: + img (PIL Image or Tensor): Image to be cropped. (0,0) denotes the top left corner of the image. + top (int): Vertical component of the top left corner of the crop box. + left (int): Horizontal component of the top left corner of the crop box. + height (int): Height of the crop box. + width (int): Width of the crop box. + size (sequence or int): Desired output size. Same semantics as ``resize``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`vision.transforms.InterpolationMode`. + Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + For backward compatibility integer values (e.g. ``PIL.Image.NEAREST``) are still acceptable. + + Returns: + PIL Image or Tensor: Cropped image. + """ + img = crop(img, top, left, height, width) + img = resize(img, size, interpolation) + return img + + +def hflip(img: Tensor) -> Tensor: + """Horizontally flip the given image. + + Args: + img (PIL Image or Tensor): Image to be flipped. If img + is a Tensor, it is expected to be in [..., H, W] format, + where ... means it can have an arbitrary number of leading + dimensions. 
+ + Returns: + PIL Image or Tensor: Horizontally flipped image. + """ + if not isinstance(img, flow.Tensor): + return F_pil.hflip(img) + + return F_t.hflip(img) + + +def vflip(img: Tensor) -> Tensor: + """Vertically flip the given image. + + Args: + img (PIL Image or Tensor): Image to be flipped. If img + is a Tensor, it is expected to be in [..., H, W] format, + where ... means it can have an arbitrary number of leading + dimensions. + + Returns: + PIL Image or Tensor: Vertically flipped image. + """ + if not isinstance(img, flow.Tensor): + return F_pil.vflip(img) + + return F_t.vflip(img) + + +def five_crop( + img: Tensor, size: List[int] +) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + """Crop the given image into four corners and the central crop. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. Note:: + This transform returns a tuple of images and there may be a + mismatch in the number of inputs and targets your ``Dataset`` returns. + + Args: + img (PIL Image or Tensor): Image to be cropped. + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + Returns: + tuple: tuple (tl, tr, bl, br, center) + Corresponding top left, top right, bottom left, bottom right and center crop. + """ + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + elif isinstance(size, (tuple, list)) and len(size) == 1: + size = (size[0], size[0]) + + if len(size) != 2: + raise ValueError("Please provide only two dimensions (h, w) for size.") + + image_width, image_height = _get_image_size(img) + crop_height, crop_width = size + if crop_width > image_width or crop_height > image_height: + msg = "Requested crop size {} is bigger than input size {}" + raise ValueError(msg.format(size, (image_height, image_width))) + + tl = crop(img, 0, 0, crop_height, crop_width) + tr = crop(img, 0, image_width - crop_width, crop_height, crop_width) + bl = crop(img, image_height - crop_height, 0, crop_height, crop_width) + br = crop( + img, + image_height - crop_height, + image_width - crop_width, + crop_height, + crop_width, + ) + + center = center_crop(img, [crop_height, crop_width]) + + return tl, tr, bl, br, center + + +def ten_crop(img: Tensor, size: List[int], vertical_flip: bool = False) -> List[Tensor]: + """Generate ten cropped images from the given image. + Crop the given image into four corners and the central crop plus the + flipped version of these (horizontal flipping is used by default). + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. Note:: + This transform returns a tuple of images and there may be a + mismatch in the number of inputs and targets your ``Dataset`` returns. + + Args: + img (PIL Image or Tensor): Image to be cropped. + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). 
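(Editorial sketch, not part of the patch.) five_crop above returns a plain 5-tuple rather than a tensor, so callers usually stack the crops themselves; ten_crop (continued below) simply appends the flipped five:

import oneflow as flow
import oneflow.utils.vision.transforms.functional as F

img = flow.zeros(3, 64, 64)
tl, tr, bl, br, center = F.five_crop(img, size=[48, 48])
batch = flow.stack([tl, tr, bl, br, center], dim=0)  # shape: (5, 3, 48, 48)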
+ vertical_flip (bool): Use vertical flipping instead of horizontal + + Returns: + tuple: tuple (tl, tr, bl, br, center, tl_flip, tr_flip, bl_flip, br_flip, center_flip) + Corresponding top left, top right, bottom left, bottom right and + center crop and same for the flipped image. + """ + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + elif isinstance(size, (tuple, list)) and len(size) == 1: + size = (size[0], size[0]) + + if len(size) != 2: + raise ValueError("Please provide only two dimensions (h, w) for size.") + + first_five = five_crop(img, size) + + if vertical_flip: + img = vflip(img) + else: + img = hflip(img) + + second_five = five_crop(img, size) + return first_five + second_five + + +def _get_inverse_affine_matrix( + center: List[float], + angle: float, + translate: List[float], + scale: float, + shear: List[float], +) -> List[float]: + # Helper method to compute inverse matrix for affine transformation + + # As it is explained in PIL.Image.rotate + # We need compute INVERSE of affine transformation matrix: M = T * C * RSS * C^-1 + # where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1] + # C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1] + # RSS is rotation with scale and shear matrix + # RSS(a, s, (sx, sy)) = + # = R(a) * S(s) * SHy(sy) * SHx(sx) + # = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(x)/cos(y) - sin(a)), 0 ] + # [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(x)/cos(y) + cos(a)), 0 ] + # [ 0 , 0 , 1 ] + # + # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears: + # SHx(s) = [1, -tan(s)] and SHy(s) = [1 , 0] + # [0, 1 ] [-tan(s), 1] + # + # Thus, the inverse is M^-1 = C * RSS^-1 * C^-1 * T^-1 + + rot = math.radians(angle) + sx, sy = [math.radians(s) for s in shear] + + cx, cy = center + tx, ty = translate + + # RSS without scaling + a = math.cos(rot - sy) / math.cos(sy) + b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot) + c = math.sin(rot - sy) / math.cos(sy) + d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot) + + # Inverted rotation matrix with scale and shear + # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + matrix = [d, -b, 0.0, -c, a, 0.0] + matrix = [x / scale for x in matrix] + + # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty) + matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty) + + # Apply center translation: C * RSS^-1 * C^-1 * T^-1 + matrix[2] += cx + matrix[5] += cy + + return matrix + + +def rotate( + img: Tensor, + angle: float, + interpolation: InterpolationMode = InterpolationMode.NEAREST, + expand: bool = False, + center: Optional[List[int]] = None, + fill: Optional[List[float]] = None, + resample: Optional[int] = None, +) -> Tensor: + """Rotate the image by angle. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + img (PIL Image or Tensor): image to be rotated. + angle (number): rotation angle value in degrees, counter-clockwise. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`flow.utils.vision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + For backward compatibility integer values (e.g. ``PIL.Image.NEAREST``) are still acceptable. 
+ expand (bool, optional): Optional expansion flag. + If true, expands the output image to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation. Origin is the upper left corner. + Default is the center of the image. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + + + Returns: + PIL Image or Tensor: Rotated image. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + if resample is not None: + warnings.warn( + "Argument resample is deprecated and will be removed since v0.10.0. Please, use interpolation instead" + ) + interpolation = _interpolation_modes_from_int(resample) + + # Backward compatibility with integer value + if isinstance(interpolation, int): + warnings.warn( + "Argument interpolation should be of type InterpolationMode instead of int. " + "Please, use InterpolationMode enum." + ) + interpolation = _interpolation_modes_from_int(interpolation) + + if not isinstance(angle, (int, float)): + raise TypeError("Argument angle should be int or float") + + if center is not None and not isinstance(center, (list, tuple)): + raise TypeError("Argument center should be a sequence") + + if not isinstance(interpolation, InterpolationMode): + raise TypeError("Argument interpolation should be a InterpolationMode") + + if not isinstance(img, flow.Tensor): + pil_interpolation = pil_modes_mapping[interpolation] + return F_pil.rotate( + img, + angle=angle, + interpolation=pil_interpolation, + expand=expand, + center=center, + fill=fill, + ) + + center_f = [0.0, 0.0] + if center is not None: + img_size = _get_image_size(img) + # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. + center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, img_size)] + + # due to current incoherence of rotation angle direction between affine and rotate implementations + # we need to set -angle. + matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0]) + raise NotImplementedError("Tensor rotate is not implemented yet!") + return F_t.rotate( + img, matrix=matrix, interpolation=interpolation.value, expand=expand, fill=fill + ) diff --git a/python/oneflow/utils/vision/transforms/functional_pil.py b/python/oneflow/utils/vision/transforms/functional_pil.py index 19e12ea1ec3d661f75e2c50adb76579c22cd0b6c..532d1fe36b2522d27e9a4da3e4e02162375f0695 100644 --- a/python/oneflow/utils/vision/transforms/functional_pil.py +++ b/python/oneflow/utils/vision/transforms/functional_pil.py @@ -13,9 +13,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
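(Editorial note, not part of the patch.) As written above, rotate only completes for PIL inputs; the tensor branch builds the inverse affine matrix and then raises NotImplementedError before reaching F_t.rotate. A PIL-only sketch (input file name hypothetical):

from PIL import Image
import oneflow.utils.vision.transforms.functional as F
from oneflow.utils.vision.transforms.functional import InterpolationMode

img = Image.open("example.jpg")  # hypothetical input file
rotated = F.rotate(img, angle=30.0, interpolation=InterpolationMode.BILINEAR, expand=True)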
""" +import numbers from typing import Any, List, Sequence +import numpy as np +from PIL import Image, ImageOps -from PIL import Image +import oneflow as flow try: import accimage @@ -30,6 +33,126 @@ def _is_pil_image(img: Any) -> bool: return isinstance(img, Image.Image) +def _get_image_size(img: Any) -> List[int]: + if _is_pil_image(img): + return img.size + raise TypeError("Unexpected type {}".format(type(img))) + + +def _get_image_num_channels(img: Any) -> int: + if _is_pil_image(img): + return 1 if img.mode == "L" else 3 + + +def hflip(img): + if not _is_pil_image(img): + raise TypeError("img should be PIL Image. Got {}".format(type(img))) + + return img.transpose(Image.FLIP_LEFT_RIGHT) + + +def vflip(img): + if not _is_pil_image(img): + raise TypeError("img should be PIL Image. Got {}".format(type(img))) + + return img.transpose(Image.FLIP_TOP_BOTTOM) + + +def pad(img, padding, fill=0, padding_mode="constant"): + if not _is_pil_image(img): + raise TypeError("img should be PIL Image. Got {}".format(type(img))) + + if not isinstance(padding, (numbers.Number, tuple, list)): + raise TypeError("Got inappropriate padding arg") + if not isinstance(fill, (numbers.Number, str, tuple)): + raise TypeError("Got inappropriate fill arg") + if not isinstance(padding_mode, str): + raise TypeError("Got inappropriate padding_mode arg") + + if isinstance(padding, list): + padding = tuple(padding) + + if isinstance(padding, tuple) and len(padding) not in [1, 2, 4]: + raise ValueError( + "Padding must be an int or a 1, 2, or 4 element tuple, not a " + + "{} element tuple".format(len(padding)) + ) + + if isinstance(padding, tuple) and len(padding) == 1: + # Compatibility with `functional_tensor.pad` + padding = padding[0] + + if padding_mode not in ["constant", "edge", "reflect", "symmetric"]: + raise ValueError( + "Padding mode should be either constant, edge, reflect or symmetric" + ) + + if padding_mode == "constant": + opts = _parse_fill(fill, img, name="fill") + if img.mode == "P": + palette = img.getpalette() + image = ImageOps.expand(img, border=padding, **opts) + image.putpalette(palette) + return image + + return ImageOps.expand(img, border=padding, **opts) + else: + if isinstance(padding, int): + pad_left = pad_right = pad_top = pad_bottom = padding + if isinstance(padding, tuple) and len(padding) == 2: + pad_left = pad_right = padding[0] + pad_top = pad_bottom = padding[1] + if isinstance(padding, tuple) and len(padding) == 4: + pad_left = padding[0] + pad_top = padding[1] + pad_right = padding[2] + pad_bottom = padding[3] + + p = [pad_left, pad_top, pad_right, pad_bottom] + cropping = -np.minimum(p, 0) + + if cropping.any(): + crop_left, crop_top, crop_right, crop_bottom = cropping + img = img.crop( + (crop_left, crop_top, img.width - crop_right, img.height - crop_bottom) + ) + + pad_left, pad_top, pad_right, pad_bottom = np.maximum(p, 0) + + if img.mode == "P": + palette = img.getpalette() + img = np.asarray(img) + img = np.pad( + img, ((pad_top, pad_bottom), (pad_left, pad_right)), padding_mode + ) + img = Image.fromarray(img) + img.putpalette(palette) + return img + + img = np.asarray(img) + # RGB image + if len(img.shape) == 3: + img = np.pad( + img, + ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)), + padding_mode, + ) + # Grayscale image + if len(img.shape) == 2: + img = np.pad( + img, ((pad_top, pad_bottom), (pad_left, pad_right)), padding_mode + ) + + return Image.fromarray(img) + + +def crop(img: Image.Image, top: int, left: int, height: int, width: int) -> Image.Image: + if 
not _is_pil_image(img): + raise TypeError("img should be PIL Image. Got {}".format(type(img))) + + return img.crop((left, top, left + width, top + height)) + + def resize(img, size, interpolation=Image.BILINEAR): if not _is_pil_image(img): raise TypeError("img should be PIL Image. Got {}".format(type(img))) @@ -54,3 +177,31 @@ def resize(img, size, interpolation=Image.BILINEAR): return img.resize((ow, oh), interpolation) else: return img.resize(size[::-1], interpolation) + + +def _parse_fill(fill, img, name="fillcolor"): + # Process fill color for affine transforms + num_bands = len(img.getbands()) + if fill is None: + fill = 0 + if isinstance(fill, (int, float)) and num_bands > 1: + fill = tuple([fill] * num_bands) + if isinstance(fill, (list, tuple)): + if len(fill) != num_bands: + msg = ( + "The number of elements in 'fill' does not match the number of " + "bands of the image ({} != {})" + ) + raise ValueError(msg.format(len(fill), num_bands)) + + fill = tuple(fill) + + return {name: fill} + + +def rotate(img, angle, interpolation=0, expand=False, center=None, fill=None): + if not _is_pil_image(img): + raise TypeError("img should be PIL Image. Got {}".format(type(img))) + + opts = _parse_fill(fill, img) + return img.rotate(angle, interpolation, expand, center, **opts) diff --git a/python/oneflow/utils/vision/transforms/functional_tensor.py b/python/oneflow/utils/vision/transforms/functional_tensor.py index 3c53907c87138c901344ee82f7bbde8141cf6352..01f3fd3bc13550e421526a516eba3c0f291abab7 100644 --- a/python/oneflow/utils/vision/transforms/functional_tensor.py +++ b/python/oneflow/utils/vision/transforms/functional_tensor.py @@ -13,7 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ -from typing import Tuple, List +import warnings +from typing import Optional, Tuple, List from oneflow.framework.tensor import Tensor import oneflow as flow @@ -43,6 +44,24 @@ def _get_image_num_channels(img: Tensor) -> int: raise TypeError("Input ndim should be 2 or more. Got {}".format(img.ndim)) +def _max_value(dtype: flow.dtype) -> float: + + a = flow.tensor(2, dtype=dtype) + # TODO:Tensor.is_signed() + # signed = 1 if flow.tensor(0, dtype=dtype).is_signed() else 0 + signed = 1 + bits = 1 + max_value = flow.tensor(-signed, dtype=flow.long) + while True: + next_value = a.pow(bits - signed).sub(1) + if next_value > max_value: + max_value = next_value + bits *= 2 + else: + break + return max_value.item() + + def _cast_squeeze_in( img: Tensor, req_dtypes: List[flow.dtype] ) -> Tuple[Tensor, bool, bool, flow.dtype]: @@ -76,6 +95,191 @@ def _cast_squeeze_out( return img +def convert_image_dtype( + image: flow.Tensor, dtype: flow.dtype = flow.float +) -> flow.Tensor: + if image.dtype == dtype: + return image + + if image.is_floating_point(): + # TODO:Tensor.is_floating_point() + if flow.tensor(0, dtype=dtype).is_floating_point(): + return image.to(dtype) + + # float to int + if (image.dtype == flow.float32 and dtype in (flow.int32, flow.int64)) or ( + image.dtype == flow.float64 and dtype == flow.int64 + ): + msg = f"The cast from {image.dtype} to {dtype} cannot be performed safely." + raise RuntimeError(msg) + + # https://github.com/pytorch/vision/pull/2078#issuecomment-612045321 + # For data in the range 0-1, (float * 255).to(uint) is only 255 + # when float is exactly 1.0. + # `max + 1 - epsilon` provides more evenly distributed mapping of + # ranges of floats to ints. 
+ eps = 1e-3 + max_val = _max_value(dtype) + result = image.mul(max_val + 1.0 - eps) + return result.to(dtype) + else: + input_max = _max_value(image.dtype) + + # int to float + if flow.tensor(0, dtype=dtype).is_floating_point(): + image = image.to(dtype) + return image / input_max + + output_max = _max_value(dtype) + + # int to int + if input_max > output_max: + factor = int((input_max + 1) // (output_max + 1)) + image = flow.div(image, factor, rounding_mode="floor") + return image.to(dtype) + else: + factor = int((output_max + 1) // (input_max + 1)) + image = image.to(dtype) + return image * factor + + +def vflip(img: Tensor) -> Tensor: + _assert_image_tensor(img) + + return img.flip(-2) + + +def hflip(img: Tensor) -> Tensor: + _assert_image_tensor(img) + + return img.flip(-1) + + +def crop(img: Tensor, top: int, left: int, height: int, width: int) -> Tensor: + _assert_image_tensor(img) + + w, h = _get_image_size(img) + right = left + width + bottom = top + height + + if left < 0 or top < 0 or right > w or bottom > h: + padding_ltrb = [ + max(-left, 0), + max(-top, 0), + max(right - w, 0), + max(bottom - h, 0), + ] + return pad( + img[..., max(top, 0) : bottom, max(left, 0) : right], padding_ltrb, fill=0 + ) + return img[..., top:bottom, left:right] + + +def _pad_symmetric(img: Tensor, padding: List[int]) -> Tensor: + # padding is left, right, top, bottom + + # crop if needed + if padding[0] < 0 or padding[1] < 0 or padding[2] < 0 or padding[3] < 0: + crop_left, crop_right, crop_top, crop_bottom = [-min(x, 0) for x in padding] + img = img[ + ..., + crop_top : img.shape[-2] - crop_bottom, + crop_left : img.shape[-1] - crop_right, + ] + padding = [max(x, 0) for x in padding] + + in_sizes = img.size() + + x_indices = [i for i in range(in_sizes[-1])] # [0, 1, 2, 3, ...] + left_indices = [i for i in range(padding[0] - 1, -1, -1)] # e.g. [3, 2, 1, 0] + right_indices = [-(i + 1) for i in range(padding[1])] # e.g. 
[-1, -2, -3] + x_indices = flow.tensor(left_indices + x_indices + right_indices, device=img.device) + + y_indices = [i for i in range(in_sizes[-2])] + top_indices = [i for i in range(padding[2] - 1, -1, -1)] + bottom_indices = [-(i + 1) for i in range(padding[3])] + y_indices = flow.tensor(top_indices + y_indices + bottom_indices, device=img.device) + + ndim = img.ndim + if ndim == 3: + return img[:, y_indices[:, None], x_indices[None, :]] + elif ndim == 4: + return img[:, :, y_indices[:, None], x_indices[None, :]] + else: + raise RuntimeError("Symmetric padding of N-D tensors are not supported yet") + + +def pad( + img: Tensor, padding: List[int], fill: int = 0, padding_mode: str = "constant" +) -> Tensor: + _assert_image_tensor(img) + + if not isinstance(padding, (int, tuple, list)): + raise TypeError("Got inappropriate padding arg") + if not isinstance(fill, (int, float)): + raise TypeError("Got inappropriate fill arg") + if not isinstance(padding_mode, str): + raise TypeError("Got inappropriate padding_mode arg") + + if isinstance(padding, tuple): + padding = list(padding) + + if isinstance(padding, list) and len(padding) not in [1, 2, 4]: + raise ValueError( + "Padding must be an int or a 1, 2, or 4 element tuple, not a " + + "{} element tuple".format(len(padding)) + ) + + if padding_mode not in ["constant", "edge", "reflect", "symmetric"]: + raise ValueError( + "Padding mode should be either constant, edge, reflect or symmetric" + ) + + if isinstance(padding, int): + pad_left = pad_right = pad_top = pad_bottom = padding + elif len(padding) == 1: + pad_left = pad_right = pad_top = pad_bottom = padding[0] + elif len(padding) == 2: + pad_left = pad_right = padding[0] + pad_top = pad_bottom = padding[1] + else: + pad_left = padding[0] + pad_top = padding[1] + pad_right = padding[2] + pad_bottom = padding[3] + + p = [pad_left, pad_right, pad_top, pad_bottom] + + if padding_mode == "edge": + # remap padding_mode str + padding_mode = "replicate" + elif padding_mode == "symmetric": + # route to another implementation + return _pad_symmetric(img, p) + + need_squeeze = False + if img.ndim < 4: + img = img.unsqueeze(dim=0) + need_squeeze = True + + out_dtype = img.dtype + need_cast = False + if (padding_mode != "constant") and img.dtype not in (flow.float32, flow.float64): + # Here we temporary cast input tensor to float + # until pytorch issue is resolved : + # https://github.com/pytorch/pytorch/issues/40763 + need_cast = True + img = img.to(flow.float32) + img = flow.F.pad(img, pad=p, mode=padding_mode, value=float(fill)) + + if need_squeeze: + img = img.squeeze(dim=0) + + if need_cast: + img = img.to(out_dtype) + return img + + def resize(img: Tensor, size: List[int], interpolation: str = "bilinear") -> Tensor: _assert_image_tensor(img) @@ -121,7 +325,7 @@ def resize(img: Tensor, size: List[int], interpolation: str = "bilinear") -> Ten # Define align_corners to avoid warnings align_corners = False if interpolation in ["bilinear", "bicubic"] else None - img = flow.F.interpolate( + img = flow.nn.functional.interpolate( img, size=[size_h, size_w], mode=interpolation, align_corners=align_corners ) @@ -133,3 +337,48 @@ def resize(img: Tensor, size: List[int], interpolation: str = "bilinear") -> Ten ) return img + + +def _assert_grid_transform_inputs( + img: Tensor, + matrix: Optional[List[float]], + interpolation: str, + fill: Optional[List[float]], + supported_interpolation_modes: List[str], + coeffs: Optional[List[float]] = None, +): + + if not (isinstance(img, flow.Tensor)): + raise 
TypeError("Input img should be Tensor") + + _assert_image_tensor(img) + + if matrix is not None and not isinstance(matrix, list): + raise TypeError("Argument matrix should be a list") + + if matrix is not None and len(matrix) != 6: + raise ValueError("Argument matrix should have 6 float values") + + if coeffs is not None and len(coeffs) != 8: + raise ValueError("Argument coeffs should have 8 float values") + + if fill is not None and not isinstance(fill, (int, float, tuple, list)): + warnings.warn("Argument fill should be either int, float, tuple or list") + + # Check fill + num_channels = _get_image_num_channels(img) + if isinstance(fill, (tuple, list)) and ( + len(fill) > 1 and len(fill) != num_channels + ): + msg = ( + "The number of elements in 'fill' cannot broadcast to match the number of " + "channels of the image ({} != {})" + ) + raise ValueError(msg.format(len(fill), num_channels)) + + if interpolation not in supported_interpolation_modes: + raise ValueError( + "Interpolation mode '{}' is unsupported with Tensor input".format( + interpolation + ) + ) diff --git a/python/oneflow/utils/vision/transforms/transforms.py b/python/oneflow/utils/vision/transforms/transforms.py index 58d5d1644cc9b7e2c3022adc234b4b98fe4881b9..b2def901ab7b99f550a5dc65ad9710c4b3239f68 100644 --- a/python/oneflow/utils/vision/transforms/transforms.py +++ b/python/oneflow/utils/vision/transforms/transforms.py @@ -14,52 +14,21 @@ See the License for the specific language governing permissions and limitations under the License. """ import warnings +import numbers from collections.abc import Sequence +from typing import Tuple, List + +import numpy as np +import random +import math from . import functional as F from .functional import InterpolationMode, _interpolation_modes_from_int - - -from oneflow.nn.module import Module +import oneflow as flow +from oneflow.nn import Module from oneflow.framework.tensor import Tensor -class Normalize(Module): - """Normalize a tensor image with mean and standard deviation. - This transform does not support PIL Image. - Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n`` - channels, this transform will normalize each channel of the input - ``flow.*Tensor`` i.e., - ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` - .. note:: - This transform acts out of place, i.e., it does not mutate the input tensor. - Args: - mean (sequence): Sequence of means for each channel. - std (sequence): Sequence of standard deviations for each channel. - inplace(bool,optional): Bool to make this operation in-place. - """ - - def __init__(self, mean, std, inplace=False): - super().__init__() - self.mean = mean - self.std = std - self.inplace = inplace - - def forward(self, tensor: Tensor) -> Tensor: - """ - Args: - tensor (Tensor): Tensor image to be normalized. - Returns: - Tensor: Normalized Tensor image. - """ - return F.normalize(tensor, self.mean, self.std, self.inplace) - - def __repr__(self): - return self.__class__.__name__ + "(mean={0}, std={1})".format( - self.mean, self.std - ) - - class Compose: """Composes several transforms together. Please, see the note below. @@ -98,16 +67,18 @@ class Compose: class ToTensor: - """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + r"""Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. 
+ Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a flow.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) or if the numpy.ndarray has dtype = np.uint8 In the other cases, tensors are returned without scaling. + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when - transforming target image masks. See the `references`_ for implementing the transforms for image masks. - .. _references: https://github.com/pytorch/vision/tree/master/references/segmentation + transforming target image masks. """ def __call__(self, pic): @@ -123,8 +94,133 @@ class ToTensor: return self.__class__.__name__ + "()" +class PILToTensor: + """Convert a ``PIL Image`` to a tensor of the same type + + Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). + """ + + def __call__(self, pic): + """ + Args: + pic (PIL Image): Image to be converted to tensor. + + Returns: + Tensor: Converted image. + """ + return F.pil_to_tensor(pic) + + def __repr__(self): + return self.__class__.__name__ + "()" + + +class ConvertImageDtype(Module): + """Convert a tensor image to the given ``dtype`` and scale the values accordingly + This function does not support PIL Image. + + Args: + dtype (flow.dtype): Desired data type of the output + + .. note:: + + When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly. + If converted back and forth, this mismatch has no effect. + + Raises: + RuntimeError: When trying to cast :class:`flow.float32` to :class:`flow.int32` or :class:`flow.int64` as + well as for trying to cast :class:`flow.float64` to :class:`flow.int64`. These conversions might lead to + overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range + of the integer ``dtype``. + """ + + def __init__(self, dtype: flow.dtype) -> None: + super().__init__() + self.dtype = dtype + + def forward(self, image): + return F.convert_image_dtype(image, self.dtype) + + +class ToPILImage: + """Convert a tensor or an ndarray to PIL Image. + + Converts a flow.Tensor of shape C x H x W or a numpy ndarray of shape + H x W x C to a PIL Image while preserving the value range. + + Args: + mode (`PIL.Image mode`_): color space and pixel depth of input data (optional). + If ``mode`` is ``None`` (default) there are some assumptions made about the input data: + - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``. + - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``. + - If the input has 2 channels, the ``mode`` is assumed to be ``LA``. + - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``, + ``short``). + + .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes + """ + + def __init__(self, mode=None): + self.mode = mode + + def __call__(self, pic): + """ + Args: + pic (Tensor or numpy.ndarray): Image to be converted to PIL Image. + + Returns: + PIL Image: Image converted to PIL Image. + + """ + return F.to_pil_image(pic, self.mode) + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + if self.mode is not None: + format_string += "mode={0}".format(self.mode) + format_string += ")" + return format_string + + +class Normalize(Module): + r"""Normalize a tensor image with mean and standard deviation. + This transform does not support PIL Image. 
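(Editorial sketch, not part of the patch.) The practical difference between ToTensor and PILToTensor above: the former scales uint8 input into a float tensor in [0.0, 1.0], the latter keeps the original integer values and dtype:

import numpy as np
from PIL import Image
import oneflow.utils.vision.transforms as transforms

pic = Image.fromarray(np.full((4, 4, 3), 255, dtype=np.uint8))
scaled = transforms.ToTensor()(pic)   # float tensor, values in [0.0, 1.0]
raw = transforms.PILToTensor()(pic)   # same data as a uint8 tensor, values 0..255
print(scaled.dtype, raw.dtype)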
+ Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n`` + channels, this transform will normalize each channel of the input + ``flow.*Tensor`` i.e., + ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` + + .. note:: + This transform acts out of place, i.e., it does not mutate the input tensor. + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + inplace(bool,optional): Bool to make this operation in-place. + """ + + def __init__(self, mean, std, inplace=False): + super().__init__() + self.mean = mean + self.std = std + self.inplace = inplace + + def forward(self, tensor: Tensor) -> Tensor: + """ + Args: + tensor (Tensor): Tensor image to be normalized. + Returns: + Tensor: Normalized Tensor image. + """ + return F.normalize(tensor, self.mean, self.std, self.inplace) + + def __repr__(self): + return self.__class__.__name__ + "(mean={0}, std={1})".format( + self.mean, self.std + ) + + class Resize(Module): - """Resize the input image to the given size. + r"""Resize the input image to the given size. If the image is oneflow Tensor, it is expected to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions @@ -175,3 +271,826 @@ class Resize(Module): return self.__class__.__name__ + "(size={0}, interpolation={1})".format( self.size, interpolate_str ) + + +class Scale(Resize): + r""" + Note: This transform is deprecated in favor of Resize. + """ + + def __init__(self, *args, **kwargs): + warnings.warn( + "The use of the transforms.Scale transform is deprecated, " + + "please use transforms.Resize instead." + ) + super(Scale, self).__init__(*args, **kwargs) + + +class CenterCrop(Module): + r"""Crops the given image at the center. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + """ + + def __init__(self, size): + super().__init__() + self.size = _setup_size( + size, error_msg="Please provide only two dimensions (h, w) for size." + ) + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. + + Returns: + PIL Image or Tensor: Cropped image. + """ + return F.center_crop(img, self.size) + + def __repr__(self): + return self.__class__.__name__ + "(size={0})".format(self.size) + + +class Pad(Module): + r"""Pad the given image on all sides with the given "pad" value. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, + at most 3 leading dimensions for mode edge, + and an arbitrary number of leading dimensions for mode constant + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + fill (number or str or tuple): Pixel fill value for constant fill. Default is 0. 
If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for oneflow Tensor. + Only int or str or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D oneflow Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def __init__(self, padding, fill=0, padding_mode="constant"): + super().__init__() + if not isinstance(padding, (numbers.Number, tuple, list)): + raise TypeError("Got inappropriate padding arg") + + if not isinstance(fill, (numbers.Number, str, tuple)): + raise TypeError("Got inappropriate fill arg") + + if padding_mode not in ["constant", "edge", "reflect", "symmetric"]: + raise ValueError( + "Padding mode should be either constant, edge, reflect or symmetric" + ) + + if isinstance(padding, Sequence) and len(padding) not in [1, 2, 4]: + raise ValueError( + "Padding must be an int or a 1, 2, or 4 element tuple, not a " + + "{} element tuple".format(len(padding)) + ) + + self.padding = padding + self.fill = fill + self.padding_mode = padding_mode + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be padded. + + Returns: + PIL Image or Tensor: Padded image. + """ + return F.pad(img, self.padding, self.fill, self.padding_mode) + + def __repr__(self): + return ( + self.__class__.__name__ + + "(padding={0}, fill={1}, padding_mode={2})".format( + self.padding, self.fill, self.padding_mode + ) + ) + + +class Lambda: + r"""Apply a user-defined lambda as a transform. + + Args: + lambd (function): Lambda/function to be used for transform. 
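(Editorial sketch, not part of the patch.) Pad above composes naturally with RandomCrop and RandomHorizontalFlip, which are defined further down in this hunk; a common CIFAR-style augmentation pipeline:

import oneflow.utils.vision.transforms as transforms

train_transform = transforms.Compose([
    transforms.Pad(4, padding_mode="reflect"),
    transforms.RandomCrop(32),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
])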
+ """ + + def __init__(self, lambd): + if not callable(lambd): + raise TypeError( + "Argument lambd should be callable, got {}".format( + repr(type(lambd).__name__) + ) + ) + self.lambd = lambd + + def __call__(self, img): + return self.lambd(img) + + def __repr__(self): + return self.__class__.__name__ + "()" + + +def _setup_size(size, error_msg): + if isinstance(size, numbers.Number): + return int(size), int(size) + + if isinstance(size, Sequence) and len(size) == 1: + return size[0], size[0] + + if len(size) != 2: + raise ValueError(error_msg) + + return size + + +class RandomTransforms: + r"""Base class for a list of transformations with randomness + + Args: + transforms (sequence): list of transformations + """ + + def __init__(self, transforms): + if not isinstance(transforms, Sequence): + raise TypeError("Argument transforms should be a sequence") + self.transforms = transforms + + def __call__(self, *args, **kwargs): + raise NotImplementedError() + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + + +class RandomApply(flow.nn.Module): + """Apply randomly a list of transformations with a given probability. + + .. note:: + In order to script the transformation, please use ``flow.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(flow.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + + + Make sure to use only scriptable transformations, i.e. that work with ``flow.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or flow.nn.Module): list of transformations + p (float): probability + """ + + def __init__(self, transforms, p=0.5): + super().__init__() + self.transforms = transforms + self.p = p + + def forward(self, img): + # TODO:replace with flow.rand(1) + if self.p < np.random.rand(1): + return img + for t in self.transforms: + img = t(img) + return img + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + format_string += "\n p={}".format(self.p) + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + + +class RandomOrder(RandomTransforms): + """Apply a list of transformations in a random order. + """ + + def __call__(self, img): + order = list(range(len(self.transforms))) + random.shuffle(order) + for i in order: + img = self.transforms[i](img) + return img + + +class RandomChoice(RandomTransforms): + """Apply single transformation randomly picked from a list. + """ + + def __call__(self, img): + t = random.choice(self.transforms) + return t(img) + + +class RandomCrop(flow.nn.Module): + """Crop the given image at a random location. + If the image is oneflow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions, + but if non-constant padding is used, the input is expected to have at most 2 leading dimensions + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. 
If a sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or str or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for flow Tensor. + Only int or str or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D flow Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + + @staticmethod + def get_params( + img: Tensor, output_size: Tuple[int, int] + ) -> Tuple[int, int, int, int]: + """Get parameters for ``crop`` for a random crop. + + Args: + img (PIL Image or Tensor): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + w, h = F._get_image_size(img) + th, tw = output_size + + if h + 1 < th or w + 1 < tw: + raise ValueError( + "Required crop size {} is larger than input image size {}".format( + (th, tw), (h, w) + ) + ) + + if w == tw and h == th: + return 0, 0, h, w + + # TODO: replace with flow.randint + # i = flow.randint(0, h - th + 1, size=(1, )).item() + # j = flow.randint(0, w - tw + 1, size=(1, )).item() + i = np.random.randint(low=0, high=h - th + 1, size=(1,), dtype=np.int32) + j = np.random.randint(low=0, high=w - tw + 1, size=(1,), dtype=np.int32) + return i, j, th, tw + + def __init__( + self, size, padding=None, pad_if_needed=False, fill=0, padding_mode="constant" + ): + super().__init__() + + self.size = tuple( + _setup_size( + size, error_msg="Please provide only two dimensions (h, w) for size." + ) + ) + + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. + + Returns: + PIL Image or Tensor: Cropped image.
+ """ + if self.padding is not None: + img = F.pad(img, self.padding, self.fill, self.padding_mode) + + width, height = F._get_image_size(img) + # pad the width if needed + if self.pad_if_needed and width < self.size[1]: + padding = [self.size[1] - width, 0] + img = F.pad(img, padding, self.fill, self.padding_mode) + # pad the height if needed + if self.pad_if_needed and height < self.size[0]: + padding = [0, self.size[0] - height] + img = F.pad(img, padding, self.fill, self.padding_mode) + + i, j, h, w = self.get_params(img, self.size) + + return F.crop(img, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + "(size={0}, padding={1})".format( + self.size, self.padding + ) + + +class RandomHorizontalFlip(flow.nn.Module): + """Horizontally flip the given image randomly with a given probability. + If the image is flow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + super().__init__() + self.p = p + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be flipped. + + Returns: + PIL Image or Tensor: Randomly flipped image. + """ + # TODO: replace with flow.rand(1): + if np.random.rand(1) < self.p: + return F.hflip(img) + return img + + def __repr__(self): + return self.__class__.__name__ + "(p={})".format(self.p) + + +class RandomVerticalFlip(flow.nn.Module): + """Vertically flip the given image randomly with a given probability. + If the image is flow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + super().__init__() + self.p = p + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be flipped. + + Returns: + PIL Image or Tensor: Randomly flipped image. + """ + # TODO:replace with flow.rand(1) + if np.random.rand(1) < self.p: + return F.vflip(img) + return img + + def __repr__(self): + return self.__class__.__name__ + "(p={})".format(self.p) + + +class RandomResizedCrop(flow.nn.Module): + """Crop a random portion of image and resize it to a given size. + + If the image is flow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + A crop of the original image is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`flow.utils.vision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` and + ``InterpolationMode.BICUBIC`` are supported. 
+ For backward compatibility integer values (e.g. ``PIL.Image.NEAREST``) are still acceptable. + + """ + + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation=InterpolationMode.BILINEAR, + ): + super().__init__() + self.size = _setup_size( + size, error_msg="Please provide only two dimensions (h, w) for size." + ) + + if not isinstance(scale, Sequence): + raise TypeError("Scale should be a sequence") + if not isinstance(ratio, Sequence): + raise TypeError("Ratio should be a sequence") + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("Scale and ratio should be of kind (min, max)") + + # Backward compatibility with integer value + if isinstance(interpolation, int): + warnings.warn( + "Argument interpolation should be of type InterpolationMode instead of int. " + "Please, use InterpolationMode enum." + ) + interpolation = _interpolation_modes_from_int(interpolation) + + self.interpolation = interpolation + self.scale = scale + self.ratio = ratio + + @staticmethod + def get_params( + img: Tensor, scale: List[float], ratio: List[float] + ) -> Tuple[int, int, int, int]: + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image or Tensor): Input image. + scale (list): range of scale of the origin size cropped + ratio (list): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + width, height = F._get_image_size(img) + area = height * width + + log_ratio = flow.log(flow.tensor(ratio)) + for _ in range(10): + target_area = area * flow.empty(1).uniform_(scale[0], scale[1]).item() + aspect_ratio = flow.exp( + flow.empty(1).uniform_(log_ratio[0], log_ratio[1]) + ).item() + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if 0 < w <= width and 0 < h <= height: + # TODO:replace with flow.randint + # i = flow.randint(0, height - h + 1, size=(1,)).item() + # j = flow.randint(0, width - w + 1, size=(1,)).item() + i = np.random.randint( + low=0, high=height - h + 1, size=(1,), dtype=np.int32 + ) + j = np.random.randint( + low=0, high=width - w + 1, size=(1,), dtype=np.int32 + ) + return i, j, h, w + + # Fallback to central crop + in_ratio = float(width) / float(height) + if in_ratio < min(ratio): + w = width + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = height + w = int(round(h * max(ratio))) + else: # whole image + w = width + h = height + i = (height - h) // 2 + j = (width - w) // 2 + return i, j, h, w + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped and resized. + + Returns: + PIL Image or Tensor: Randomly cropped and resized image. + """ + i, j, h, w = self.get_params(img, self.scale, self.ratio) + return F.resized_crop(img, i, j, h, w, self.size, self.interpolation) + + def __repr__(self): + interpolate_str = self.interpolation.value + format_string = self.__class__.__name__ + "(size={0}".format(self.size) + format_string += ", scale={0}".format(tuple(round(s, 4) for s in self.scale)) + format_string += ", ratio={0}".format(tuple(round(r, 4) for r in self.ratio)) + format_string += ", interpolation={0})".format(interpolate_str) + return format_string + + +class RandomSizedCrop(RandomResizedCrop): + """ + Note: This transform is deprecated in favor of RandomResizedCrop. 
+ """ + + def __init__(self, *args, **kwargs): + warnings.warn( + "The use of the transforms.RandomSizedCrop transform is deprecated, " + + "please use transforms.RandomResizedCrop instead." + ) + super(RandomSizedCrop, self).__init__(*args, **kwargs) + + +class FiveCrop(flow.nn.Module): + """Crop the given image into four corners and the central crop. + If the image is flow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + Example: + >>> transform = Compose([ + >>> FiveCrop(size), # this is a list of PIL Images + >>> Lambda(lambda crops: flow.stack([ToTensor()(crop) for crop in crops])) # returns a 4D tensor + >>> ]) + >>> #In your test loop you can do the following: + >>> input, target = batch # input is a 5d tensor, target is 2d + >>> bs, ncrops, c, h, w = input.size() + >>> result = model(input.view(-1, c, h, w)) # fuse batch size and ncrops + >>> result_avg = result.view(bs, ncrops, -1).mean(1) # avg over crops + """ + + def __init__(self, size): + super().__init__() + self.size = _setup_size( + size, error_msg="Please provide only two dimensions (h, w) for size." + ) + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. + + Returns: + tuple of 5 images. Image can be PIL Image or Tensor + """ + return F.five_crop(img, self.size) + + def __repr__(self): + return self.__class__.__name__ + "(size={0})".format(self.size) + + +class TenCrop(flow.nn.Module): + """Crop the given image into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + If the image is flow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool): Use vertical flipping instead of horizontal + + Example: + >>> transform = Compose([ + >>> TenCrop(size), # this is a list of PIL Images + >>> Lambda(lambda crops: flow.stack([ToTensor()(crop) for crop in crops])) # returns a 4D tensor + >>> ]) + >>> #In your test loop you can do the following: + >>> input, target = batch # input is a 5d tensor, target is 2d + >>> bs, ncrops, c, h, w = input.size() + >>> result = model(input.view(-1, c, h, w)) # fuse batch size and ncrops + >>> result_avg = result.view(bs, ncrops, -1).mean(1) # avg over crops + """ + + def __init__(self, size, vertical_flip=False): + super().__init__() + self.size = _setup_size( + size, error_msg="Please provide only two dimensions (h, w) for size." + ) + self.vertical_flip = vertical_flip + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. 
+ + Returns: + tuple of 10 images. Image can be PIL Image or Tensor + """ + return F.ten_crop(img, self.size, self.vertical_flip) + + def __repr__(self): + return self.__class__.__name__ + "(size={0}, vertical_flip={1})".format( + self.size, self.vertical_flip + ) + + +class RandomRotation(flow.nn.Module): + """Rotate the image by angle. + If the image is flow Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`flow.utils.vision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + For backward compatibility integer values (e.g. ``PIL.Image.NEAREST``) are still acceptable. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (sequence or number): Pixel fill value for the area outside the rotated + image. Default is ``0``. If given a number, the value is used for all bands respectively. + resample (int, optional): deprecated argument and will be removed since v0.10.0. + Please use the ``interpolation`` parameter instead. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + + def __init__( + self, + degrees, + interpolation=InterpolationMode.NEAREST, + expand=False, + center=None, + fill=0, + resample=None, + ): + super().__init__() + if resample is not None: + warnings.warn( + "Argument resample is deprecated and will be removed since v0.10.0. Please, use interpolation instead" + ) + interpolation = _interpolation_modes_from_int(resample) + + # Backward compatibility with integer value + if isinstance(interpolation, int): + warnings.warn( + "Argument interpolation should be of type InterpolationMode instead of int. " + "Please, use InterpolationMode enum." + ) + interpolation = _interpolation_modes_from_int(interpolation) + + self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) + + if center is not None: + _check_sequence_input(center, "center", req_sizes=(2,)) + + self.center = center + + self.resample = self.interpolation = interpolation + self.expand = expand + + if fill is None: + fill = 0 + elif not isinstance(fill, (Sequence, numbers.Number)): + raise TypeError("Fill should be either a sequence or a number.") + + self.fill = fill + + @staticmethod + def get_params(degrees: List[float]) -> float: + """Get parameters for ``rotate`` for a random rotation. + + Returns: + float: angle parameter to be passed to ``rotate`` for random rotation. + """ + angle = float( + flow.empty(1).uniform_(float(degrees[0]), float(degrees[1])).item() + ) + return angle + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be rotated. + + Returns: + PIL Image or Tensor: Rotated image. 
+ """ + fill = self.fill + if isinstance(img, Tensor): + if isinstance(fill, (int, float)): + fill = [float(fill)] * F._get_image_num_channels(img) + else: + fill = [float(f) for f in fill] + angle = self.get_params(self.degrees) + + return F.rotate(img, angle, self.resample, self.expand, self.center, fill) + + def __repr__(self): + interpolate_str = self.interpolation.value + format_string = self.__class__.__name__ + "(degrees={0}".format(self.degrees) + format_string += ", interpolation={0}".format(interpolate_str) + format_string += ", expand={0}".format(self.expand) + if self.center is not None: + format_string += ", center={0}".format(self.center) + if self.fill is not None: + format_string += ", fill={0}".format(self.fill) + format_string += ")" + return format_string + + +def _setup_size(size, error_msg): + if isinstance(size, numbers.Number): + return int(size), int(size) + + if isinstance(size, Sequence) and len(size) == 1: + return size[0], size[0] + + if len(size) != 2: + raise ValueError(error_msg) + + return size + + +def _check_sequence_input(x, name, req_sizes): + msg = ( + req_sizes[0] if len(req_sizes) < 2 else " or ".join([str(s) for s in req_sizes]) + ) + if not isinstance(x, Sequence): + raise TypeError("{} should be a sequence of length {}.".format(name, msg)) + if len(x) not in req_sizes: + raise ValueError("{} should be sequence of length {}.".format(name, msg)) + + +def _setup_angle(x, name, req_sizes=(2,)): + if isinstance(x, numbers.Number): + if x < 0: + raise ValueError( + "If {} is a single number, it must be positive.".format(name) + ) + x = [-x, x] + else: + _check_sequence_input(x, name, req_sizes) + + return [float(d) for d in x]