transforms.py 74.3 KB
Newer Older
L
LielinJiang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
16
import numbers
L
LielinJiang 已提交
17
import random
18 19
import traceback
from collections.abc import Iterable, Sequence
L
LielinJiang 已提交
20 21 22

import numpy as np

23
import paddle
L
LielinJiang 已提交
24

25
from . import functional as F
L
LielinJiang 已提交
26

27
__all__ = []
L
LielinJiang 已提交
28 29


30 31 32 33 34
def _get_image_size(img):
    """Return the spatial size of ``img`` in ``(width, height)`` order.

    Args:
        img: An image accepted by ``F._is_pil_image``, ``F._is_numpy_image``
            or ``F._is_tensor_image``.

    Returns:
        ``img.size`` for a PIL image; the first two axes reversed for a
        numpy image; the last two axes reversed for a 3-D (chw) or
        4-D (nchw) tensor.

    Raises:
        ValueError: If a tensor image is neither 3-D nor 4-D.
        TypeError: If ``img`` is none of the supported image types.
    """
    if F._is_pil_image(img):
        return img.size
    if F._is_numpy_image(img):
        return img.shape[:2][::-1]
    if not F._is_tensor_image(img):
        raise TypeError(f"Unexpected type {type(img)}")

    ndim = len(img.shape)
    if ndim not in (3, 4):
        raise ValueError(
            "The dim for input Tensor should be 3-D or 4-D, but received {}".format(
                ndim
            )
        )
    # chw -> wh and nchw -> wh: in both layouts the size is the last two
    # axes, reversed.
    return img.shape[-2:][::-1]
48 49


50 51 52
def _check_input(
    value, name, center=1, bound=(0, float('inf')), clip_first_on_zero=True
):
53 54 55 56
    if isinstance(value, numbers.Number):
        if value < 0:
            raise ValueError(
                "If {} is a single number, it must be non negative.".format(
57 58 59
                    name
                )
            )
60 61 62 63 64
        value = [center - value, center + value]
        if clip_first_on_zero:
            value[0] = max(value[0], 0)
    elif isinstance(value, (tuple, list)) and len(value) == 2:
        if not bound[0] <= value[0] <= value[1] <= bound[1]:
65
            raise ValueError(f"{name} values should be between {bound}")
66 67
    else:
        raise TypeError(
68 69 70 71
            "{} should be a single number or a list/tuple with lenght 2.".format(
                name
            )
        )
72 73 74 75 76 77

    if value[0] == value[1] == center:
        value = None
    return value


78
class Compose:
    """
    Chain several transforms into a single callable, typically used as the
    transform of a dataset.

    Args:
        transforms (list|tuple): List/Tuple of transforms to compose.

    Returns:
        A callable Compose object. Calling it feeds the data through each of
        the given :attr:`transforms` in order.

    Examples:

        .. code-block:: python

            >>> from paddle.vision.datasets import Flowers
            >>> from paddle.vision.transforms import Compose, ColorJitter, Resize
            >>> transform = Compose([ColorJitter(), Resize(size=608)])
            >>> flowers = Flowers(mode='test', transform=transform)
            >>> for i in range(3):
            ...     sample = flowers[i]
            ...     print(sample[0].size, sample[1])
            (916, 608) [1]
            (758, 608) [1]
            (811, 608) [1]
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, data):
        """Run ``data`` through every transform in order and return the result.

        If a transform raises, the failing transform, the error and the stack
        are printed and the same exception is re-raised.
        """
        for transform in self.transforms:
            try:
                data = transform(data)
            except Exception as e:
                stack_info = traceback.format_exc()
                print(
                    "fail to perform transform [{}] with error: "
                    "{} and stack:\n{}".format(transform, e, str(stack_info))
                )
                raise e
        return data

    def __repr__(self):
        # One indented line per transform, wrapped in "ClassName(" ... ")".
        body = ''.join(f'\n    {t}' for t in self.transforms)
        return self.__class__.__name__ + '(' + body + '\n)'


131
class BaseTransform:
132 133
    """
    Base class of all transforms used in computer vision.
L
LielinJiang 已提交
134

135
    calling logic:
136

I
Infinity_lee 已提交
137 138
    .. code-block:: text

139 140 141
        if keys is None:
            _get_params -> _apply_image()
        else:
142
            _get_params -> _apply_*() for * in keys
143 144 145

    If you want to implement a self-defined transform method for image,
    rewrite _apply_* method in subclass.
L
LielinJiang 已提交
146

147 148 149 150
    Args:
        keys (list[str]|tuple[str], optional): Input type. Input is a tuple contains different structures,
            key is used to specify the type of input. For example, if your input
            is image type, then the key can be None or ("image"). if your input
151
            is (image, image) type, then the keys should be ("image", "image").
152 153 154 155
            if your input is (image, boxes), then the keys should be ("image", "boxes").

            Current available strings & data type are describe below:

I
Infinity_lee 已提交
156 157 158 159 160
                - "image": input image, with shape of (H, W, C)
                - "coords": coordinates, with shape of (N, 2)
                - "boxes": bounding boxes, with shape of (N, 4), "xyxy" format,the 1st "xy" represents
                  top left point of a box,the 2nd "xy" represents right bottom point.
                - "mask": map used for segmentation, with shape of (H, W, 1)
161

162 163
            You can also customize your data types only if you implement the corresponding
            _apply_*() methods, otherwise ``NotImplementedError`` will be raised.
164

L
LielinJiang 已提交
165
    Examples:
166

L
LielinJiang 已提交
167 168
        .. code-block:: python

169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
            >>> import numpy as np
            >>> from PIL import Image
            >>> import paddle.vision.transforms.functional as F
            >>> from paddle.vision.transforms import BaseTransform

            >>> def _get_image_size(img):
            ...     if F._is_pil_image(img):
            ...         return img.size
            ...     elif F._is_numpy_image(img):
            ...         return img.shape[:2][::-1]
            ...     else:
            ...         raise TypeError("Unexpected type {}".format(type(img)))
            ...
            >>> class CustomRandomFlip(BaseTransform):
            ...     def __init__(self, prob=0.5, keys=None):
            ...         super().__init__(keys)
            ...         self.prob = prob
            ...
            ...     def _get_params(self, inputs):
            ...         image = inputs[self.keys.index('image')]
            ...         params = {}
            ...         params['flip'] = np.random.random() < self.prob
            ...         params['size'] = _get_image_size(image)
            ...         return params
            ...
            ...     def _apply_image(self, image):
            ...         if self.params['flip']:
            ...             return F.hflip(image)
            ...         return image
            ...
            ...     # if you only want to transform image, do not need to rewrite this function
            ...     def _apply_coords(self, coords):
            ...         if self.params['flip']:
            ...             w = self.params['size'][0]
            ...             coords[:, 0] = w - coords[:, 0]
            ...         return coords
            ...
            ...     # if you only want to transform image, do not need to rewrite this function
            ...     def _apply_boxes(self, boxes):
            ...         idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten()
            ...         coords = np.asarray(boxes).reshape(-1, 4)[:, idxs].reshape(-1, 2)
            ...         coords = self._apply_coords(coords).reshape((-1, 4, 2))
            ...         minxy = coords.min(axis=1)
            ...         maxxy = coords.max(axis=1)
            ...         trans_boxes = np.concatenate((minxy, maxxy), axis=1)
            ...         return trans_boxes
            ...
            ...     # if you only want to transform image, do not need to rewrite this function
            ...     def _apply_mask(self, mask):
            ...         if self.params['flip']:
            ...             return F.hflip(mask)
            ...         return mask
            ...
            >>> # create fake inputs
            >>> fake_img = Image.fromarray((np.random.rand(400, 500, 3) * 255.).astype('uint8'))
            >>> fake_boxes = np.array([[2, 3, 200, 300], [50, 60, 80, 100]])
            >>> fake_mask = fake_img.convert('L')
            >>> # only transform for image:
            >>> flip_transform = CustomRandomFlip(1.0)
            >>> converted_img = flip_transform(fake_img)
            >>> # transform for image, boxes and mask
            >>> flip_transform = CustomRandomFlip(1.0, keys=('image', 'boxes', 'mask'))
            >>> (converted_img, converted_boxes, converted_mask) = flip_transform((fake_img, fake_boxes, fake_mask))
            >>> converted_boxes
            array([[300,   3, 498, 300],
                   [420,  60, 450, 100]])
L
LielinJiang 已提交
235 236 237

    """

238 239
    def __init__(self, keys=None):
        if keys is None:
240
            keys = ("image",)
241
        elif not isinstance(keys, Sequence):
242
            raise ValueError(f"keys should be a sequence, but got keys={keys}")
243 244
        for k in keys:
            if self._get_apply(k) is None:
245
                raise NotImplementedError(f"{k} is unsupported data structure")
246 247 248 249 250 251 252 253 254 255 256
        self.keys = keys

        # storage some params get from function get_params()
        self.params = None

    def _get_params(self, inputs):
        pass

    def __call__(self, inputs):
        """Apply transform on single input data"""
        if not isinstance(inputs, tuple):
257
            inputs = (inputs,)
258 259 260 261 262 263 264 265 266 267 268

        self.params = self._get_params(inputs)

        outputs = []
        for i in range(min(len(inputs), len(self.keys))):
            apply_func = self._get_apply(self.keys[i])
            if apply_func is None:
                outputs.append(inputs[i])
            else:
                outputs.append(apply_func(inputs[i]))
        if len(inputs) > len(self.keys):
269
            outputs.extend(inputs[len(self.keys) :])
270 271 272 273 274 275

        if len(outputs) == 1:
            outputs = outputs[0]
        else:
            outputs = tuple(outputs)
        return outputs
L
LielinJiang 已提交
276

277
    def _get_apply(self, key):
278
        return getattr(self, f"_apply_{key}", None)
L
LielinJiang 已提交
279

280 281
    def _apply_image(self, image):
        raise NotImplementedError
L
LielinJiang 已提交
282

283 284
    def _apply_boxes(self, boxes):
        raise NotImplementedError
L
LielinJiang 已提交
285

286 287
    def _apply_mask(self, mask):
        raise NotImplementedError
L
LielinJiang 已提交
288

289 290 291 292

class ToTensor(BaseTransform):
    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to ``paddle.Tensor``.

    Converts a PIL.Image or numpy.ndarray (H x W x C) to a paddle.Tensor of shape (C x H x W).

    If input is a grayscale image (H x W), it will be converted to an image of shape (H x W x 1).
    And the shape of output tensor will be (1 x H x W).

    If you want to keep the shape of output tensor as (H x W x C), you can set data_format = ``HWC`` .

    Converts a PIL.Image or numpy.ndarray in the range [0, 255] to a paddle.Tensor in the
    range [0.0, 1.0] if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr,
    RGBA, CMYK, 1) or if the numpy.ndarray has dtype = np.uint8.

    In the other cases, tensors are returned without scaling.

    Args:
        data_format (str, optional): Data format of output tensor, should be 'HWC' or
            'CHW'. Default: 'CHW'.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray): The input image with shape (H x W x C).
        - output(paddle.Tensor): A tensor with shape (C x H x W) or (H x W x C) according option data_format.

    Returns:
        A callable object of ToTensor.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> import paddle.vision.transforms as T
            >>> import paddle.vision.transforms.functional as F

            >>> fake_img = Image.fromarray((np.random.rand(4, 5, 3) * 255.).astype(np.uint8))
            >>> transform = T.ToTensor()
            >>> tensor = transform(fake_img)
            >>> print(tensor.shape)
            [3, 4, 5]
            >>> print(tensor.dtype)
            paddle.float32
    """

    def __init__(self, data_format='CHW', keys=None):
        super().__init__(keys)
        # Layout handed to ``F.to_tensor`` on every call.
        self.data_format = data_format

    def _apply_image(self, img):
        """Convert one image to a tensor using the configured data format.

        Args:
            img (PIL.Image|np.ndarray): Image to be converted to tensor.

        Returns:
            Tensor: Converted image.
        """
        layout = self.data_format
        return F.to_tensor(img, layout)


class Resize(BaseTransform):
    """Resize the input Image to the given size.

    Args:
        size (int|list|tuple): Desired output size. If size is a sequence like
            (h, w), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e, if height > width, then image will be rescaled to
            (size * height / width, size)
        interpolation (int|str, optional): Interpolation method. Default: 'bilinear'.
            when use pil backend, support method are as following:
            - "nearest": Image.NEAREST,
            - "bilinear": Image.BILINEAR,
            - "bicubic": Image.BICUBIC,
            - "box": Image.BOX,
            - "lanczos": Image.LANCZOS,
            - "hamming": Image.HAMMING
            when use cv2 backend, support method are as following:
            - "nearest": cv2.INTER_NEAREST,
            - "bilinear": cv2.INTER_LINEAR,
            - "area": cv2.INTER_AREA,
            - "bicubic": cv2.INTER_CUBIC,
            - "lanczos": cv2.INTER_LANCZOS4
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A resized image.

    Returns:
        A callable object of Resize.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import Resize

            >>> fake_img = Image.fromarray((np.random.rand(256, 300, 3) * 255.).astype(np.uint8))
            >>> transform = Resize(size=224)
            >>> converted_img = transform(fake_img)
            >>> print(converted_img.size)
            (262, 224)
            >>> transform = Resize(size=(200,150))
            >>> converted_img = transform(fake_img)
            >>> print(converted_img.size)
            (150, 200)
    """

    def __init__(self, size, interpolation='bilinear', keys=None):
        super().__init__(keys)
        # An int is always accepted; anything else must be an iterable
        # holding exactly two entries.
        if not isinstance(size, int):
            assert isinstance(size, Iterable) and len(size) == 2
        self.size = size
        self.interpolation = interpolation

    def _apply_image(self, img):
        # Resizing itself is delegated to the functional API.
        target_size, method = self.size, self.interpolation
        return F.resize(img, target_size, method)


414
class RandomResizedCrop(BaseTransform):
    """Crop the input data to random size and aspect ratio.
    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made.
    After applying crop transform, the input data will be resized to given size.

    Args:
        size (int|list|tuple): Target size of output image, with (height, width) shape.
        scale (list|tuple, optional): Scale range of the cropped image before resizing, relatively to the origin
            image. Default: (0.08, 1.0).
        ratio (list|tuple, optional): Range of aspect ratio of the origin aspect ratio cropped. Default: (3/4, 4/3)
        interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. when use pil backend,
            support method are as following:
            - "nearest": Image.NEAREST,
            - "bilinear": Image.BILINEAR,
            - "bicubic": Image.BICUBIC,
            - "box": Image.BOX,
            - "lanczos": Image.LANCZOS,
            - "hamming": Image.HAMMING
            when use cv2 backend, support method are as following:
            - "nearest": cv2.INTER_NEAREST,
            - "bilinear": cv2.INTER_LINEAR,
            - "area": cv2.INTER_AREA,
            - "bicubic": cv2.INTER_CUBIC,
            - "lanczos": cv2.INTER_LANCZOS4
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A cropped image.

    Returns:
        A callable object of RandomResizedCrop.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import RandomResizedCrop

            >>> transform = RandomResizedCrop(224)
            >>> fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (224, 224)

    """

    def __init__(
        self,
        size,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4, 4.0 / 3),
        interpolation='bilinear',
        keys=None,
    ):
        super().__init__(keys)
        # A single int means a square output.
        if isinstance(size, int):
            self.size = (size, size)
        else:
            self.size = size
        assert scale[0] <= scale[1], "scale should be of kind (min, max)"
        assert ratio[0] <= ratio[1], "ratio should be of kind (min, max)"
        self.scale = scale
        self.ratio = ratio
        self.interpolation = interpolation

    def _dynamic_get_param(self, image, attempts=10):
        """Pick a crop box ``(i, j, h, w)`` using Python-side random numbers.

        Up to ``attempts`` random area/aspect-ratio pairs are tried. The first
        one that fits inside the image is returned with a random top-left
        corner ``(i, j)``. If none fits, a central crop is returned whose
        aspect ratio is clamped to ``self.ratio``.

        Args:
            image: Image whose size is read through ``_get_image_size``.
            attempts (int, optional): Maximum number of random tries. Default: 10.

        Returns:
            tuple: ``(i, j, h, w)`` - top, left, height and width of the crop.
        """
        width, height = _get_image_size(image)
        area = height * width
        # Loop-invariant: the log of the ratio bounds does not change between
        # attempts. Computing it once does not touch any random stream.
        log_ratio = tuple(math.log(x) for x in self.ratio)

        for _ in range(attempts):
            target_area = np.random.uniform(*self.scale) * area
            aspect_ratio = math.exp(np.random.uniform(*log_ratio))

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if 0 < w <= width and 0 < h <= height:
                i = random.randint(0, height - h)
                j = random.randint(0, width - w)
                return i, j, h, w

        # Fallback to central crop
        in_ratio = float(width) / float(height)
        if in_ratio < min(self.ratio):
            w = width
            h = int(round(w / min(self.ratio)))
        elif in_ratio > max(self.ratio):
            h = height
            w = int(round(h * max(self.ratio)))
        else:
            # return whole image
            w = width
            h = height
        i = (height - h) // 2
        j = (width - w) // 2
        return i, j, h, w

    def _static_get_param(self, image, attempts=10):
        """Build the crop-box computation with paddle static-graph ops.

        Mirrors ``_dynamic_get_param`` using ``paddle.static.nn.while_loop``
        and ``paddle.static.nn.cond``.

        Args:
            image: Image whose size is read through ``_get_image_size``.
            attempts (int, optional): Maximum number of loop iterations.
                Default: 10.

        Returns:
            The result of ``paddle.static.nn.cond`` selecting between the
            sampled ``[i, j, h, w, counter]`` and the central-crop fallback.
        """
        width, height = _get_image_size(image)
        area = height * width
        log_ratio = tuple(math.log(x) for x in self.ratio)

        counter = paddle.full(
            shape=[1], fill_value=0, dtype='int32'
        )  # loop counter

        # Loop length. Honors ``attempts`` (previously hard-coded to 10, so
        # the parameter was silently ignored).
        max_attempts = paddle.full(
            shape=[1], fill_value=attempts, dtype='int32'
        )

        i = paddle.zeros([1], dtype="int32")
        j = paddle.zeros([1], dtype="int32")
        # Start with an out-of-range box so the loop condition is initially met.
        h = paddle.ones([1], dtype="int32") * (height + 1)
        w = paddle.ones([1], dtype="int32") * (width + 1)

        # NOTE(review): the conditions below combine tensors with Python
        # ``and``/``or`` and chained comparisons, which rely on implicit bool
        # conversion of static tensors. Left unchanged here - confirm this
        # evaluates as intended, or switch to paddle.logical_and/logical_or.
        def cond(counter, max_attempts, i, j, h, w):
            return (counter < max_attempts) and (w > width or h > height)

        def body(counter, max_attempts, i, j, h, w):
            target_area = (
                paddle.uniform(shape=[1], min=self.scale[0], max=self.scale[1])
                * area
            )
            aspect_ratio = paddle.exp(
                paddle.uniform(shape=[1], min=log_ratio[0], max=log_ratio[1])
            )

            w = paddle.round(paddle.sqrt(target_area * aspect_ratio)).astype(
                'int32'
            )
            h = paddle.round(paddle.sqrt(target_area / aspect_ratio)).astype(
                'int32'
            )

            # Only draw a new corner when the sampled box fits in the image.
            i = paddle.static.nn.cond(
                0 < w <= width and 0 < h <= height,
                lambda: paddle.uniform(shape=[1], min=0, max=height - h).astype(
                    "int32"
                ),
                lambda: i,
            )

            j = paddle.static.nn.cond(
                0 < w <= width and 0 < h <= height,
                lambda: paddle.uniform(shape=[1], min=0, max=width - w).astype(
                    "int32"
                ),
                lambda: j,
            )

            counter += 1

            return counter, max_attempts, i, j, h, w

        counter, max_attempts, i, j, h, w = paddle.static.nn.while_loop(
            cond, body, [counter, max_attempts, i, j, h, w]
        )

        def central_crop(width, height):
            height = paddle.assign([height]).astype("float32")
            width = paddle.assign([width]).astype("float32")

            # Fallback to central crop
            in_ratio = width / height

            w, h = paddle.static.nn.cond(
                in_ratio < self.ratio[0],
                lambda: [
                    width.astype("int32"),
                    paddle.round(width / self.ratio[0]).astype("int32"),
                ],
                lambda: paddle.static.nn.cond(
                    in_ratio > self.ratio[1],
                    lambda: [
                        # Cast to int32 so every branch yields the same dtype
                        # (this branch previously returned a float32 width).
                        paddle.round(height * self.ratio[1]).astype("int32"),
                        height.astype("int32"),
                    ],
                    lambda: [width.astype("int32"), height.astype("int32")],
                ),
            )
            i = (height.astype("int32") - h) // 2
            j = (width.astype("int32") - w) // 2

            return i, j, h, w, counter

        return paddle.static.nn.cond(
            0 < w <= width and 0 < h <= height,
            lambda: [i, j, h, w, counter],
            lambda: central_crop(width, height),
        )

    def _apply_image(self, img):
        """Crop ``img`` with a freshly sampled box, then resize to ``self.size``."""
        if paddle.in_dynamic_mode():
            i, j, h, w = self._dynamic_get_param(img)
        else:
            # The loop counter returned by the static path is not needed here.
            i, j, h, w, _ = self._static_get_param(img)

        cropped_img = F.crop(img, i, j, h, w)
        return F.resize(cropped_img, self.size, self.interpolation)


620
class CenterCrop(BaseTransform):
    """Crops the given input data at the center.

    Args:
        size (int|list|tuple): Target size of output image, with (height, width) shape.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A cropped image.

    Returns:
        A callable object of CenterCrop.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import CenterCrop

            >>> transform = CenterCrop(224)
            >>> fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (224, 224)

    """

    def __init__(self, size, keys=None):
        super().__init__(keys)
        # A single number is expanded to a square (int, int) size; anything
        # else is stored as given.
        self.size = (
            (int(size), int(size)) if isinstance(size, numbers.Number) else size
        )

    def _apply_image(self, img):
        # Cropping itself is delegated to the functional API.
        crop_size = self.size
        return F.center_crop(img, crop_size)
L
LielinJiang 已提交
659 660


661
class RandomHorizontalFlip(BaseTransform):
    """Horizontally flip the input data randomly with a given probability.

    Args:
        prob (float, optional): Probability of the input data being flipped. Should be in [0, 1]. Default: 0.5
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A horizontally flipped image.

    Returns:
        A callable object of RandomHorizontalFlip.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import RandomHorizontalFlip

            >>> transform = RandomHorizontalFlip(0.5)
            >>> fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (320, 300)
    """

    def __init__(self, prob=0.5, keys=None):
        super().__init__(keys)
        assert 0 <= prob <= 1, "probability must be between 0 and 1"
        self.prob = prob

    def _apply_image(self, img):
        # Dispatch on the current paddle execution mode.
        if not paddle.in_dynamic_mode():
            return self._static_apply_image(img)
        return self._dynamic_apply_image(img)

    def _dynamic_apply_image(self, img):
        # Flip when the Python-side random draw falls below ``prob``.
        return F.hflip(img) if random.random() < self.prob else img

    def _static_apply_image(self, img):
        # Same decision expressed as a graph condition on a random tensor.
        should_flip = paddle.rand(shape=(1,)) < self.prob
        return paddle.static.nn.cond(
            should_flip,
            lambda: F.hflip(img),
            lambda: img,
        )

L
LielinJiang 已提交
713

714
class RandomVerticalFlip(BaseTransform):
    """Vertically flip the input data randomly with a given probability.

    Args:
        prob (float, optional): Probability of the input data being flipped. Should be in [0, 1]. Default: 0.5
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A vertically flipped image.

    Returns:
        A callable object of RandomVerticalFlip.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import RandomVerticalFlip
            >>> transform = RandomVerticalFlip()
            >>> fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (320, 300)

    """

    def __init__(self, prob=0.5, keys=None):
        super().__init__(keys)
        assert 0 <= prob <= 1, "probability must be between 0 and 1"
        self.prob = prob

    def _apply_image(self, img):
        # Dispatch on the current paddle execution mode.
        if not paddle.in_dynamic_mode():
            return self._static_apply_image(img)
        return self._dynamic_apply_image(img)

    def _dynamic_apply_image(self, img):
        # Flip when the Python-side random draw falls below ``prob``.
        return F.vflip(img) if random.random() < self.prob else img

    def _static_apply_image(self, img):
        # Same decision expressed as a graph condition on a random tensor.
        should_flip = paddle.rand(shape=(1,)) < self.prob
        return paddle.static.nn.cond(
            should_flip,
            lambda: F.vflip(img),
            lambda: img,
        )

L
LielinJiang 已提交
766

767
class Normalize(BaseTransform):
    """Normalize the input data with mean and standard deviation.

    Given mean ``(M1,...,Mn)`` and std ``(S1,..,Sn)`` for ``n`` channels,
    each channel of the input data is normalized independently:
    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``

    Args:
        mean (int|float|list|tuple, optional): Sequence of means for each channel.
            A single number is repeated for three channels. Default: 0.0.
        std (int|float|list|tuple, optional): Sequence of standard deviations for
            each channel. A single number is repeated for three channels. Default: 1.0.
        data_format (str, optional): Data format of img, should be 'HWC' or
            'CHW'. Default: 'CHW'.
        to_rgb (bool, optional): Whether to convert to rgb. Default: False.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A normalized array or tensor.

    Returns:
        A callable object of Normalize.

    Examples:

        .. code-block:: python
            :name: code-example

            >>> import paddle
            >>> from paddle.vision.transforms import Normalize
            >>> paddle.seed(2023)

            >>> normalize = Normalize(mean=[127.5, 127.5, 127.5],
            ...                         std=[127.5, 127.5, 127.5],
            ...                         data_format='HWC')
            ...
            >>> fake_img = paddle.rand([300,320,3]).numpy() * 255.
            >>> fake_img = normalize(fake_img)
            >>> print(fake_img.shape)
            (300, 320, 3)
            >>> print(fake_img.max(), fake_img.min())
            0.99999464 -0.9999929

    """

    def __init__(
        self, mean=0.0, std=1.0, data_format='CHW', to_rgb=False, keys=None
    ):
        super().__init__(keys)
        # A scalar statistic is expanded to one entry per channel (three).
        self.mean = [mean] * 3 if isinstance(mean, numbers.Number) else mean
        self.std = [std] * 3 if isinstance(std, numbers.Number) else std
        self.data_format = data_format
        self.to_rgb = to_rgb

    def _apply_image(self, img):
        """Delegate to ``F.normalize`` with the stored settings."""
        return F.normalize(
            img,
            self.mean,
            self.std,
            self.data_format,
            self.to_rgb,
        )
L
LielinJiang 已提交
829 830


831 832
class Transpose(BaseTransform):
    """Transpose input data to a target format.

    For example, most transforms use HWC mode image,
    while the Neural Network might use CHW mode input tensor.
    A PIL image is converted to a numpy.ndarray before being transposed;
    a tensor input is transposed directly and stays a tensor.

    Args:
        order (list|tuple, optional): Target order of input data. Default: (2, 0, 1).
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(np.ndarray|Paddle.Tensor): A transposed array or tensor. If input
            is a PIL.Image, output will be converted to np.ndarray automatically.

    Returns:
        A callable object of Transpose.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import Transpose

            >>> transform = Transpose()
            >>> fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.shape)
            (3, 300, 320)

    """

    def __init__(self, order=(2, 0, 1), keys=None):
        super().__init__(keys)
        self.order = order

    def _apply_image(self, img):
        """Reorder the axes of ``img`` according to ``self.order``."""
        if not F._is_tensor_image(img):
            # Non-tensor inputs are brought to an ndarray with at least
            # three axes before the transpose.
            if F._is_pil_image(img):
                img = np.asarray(img)
            if len(img.shape) == 2:
                # Give a 2-D image a trailing axis of size one.
                img = img[..., np.newaxis]
        return img.transpose(self.order)
L
LielinJiang 已提交
879 880


881
class BrightnessTransform(BaseTransform):
    """Adjust brightness of the image.

    Args:
        value (float): How much to adjust the brightness. Can be any
            non negative number. 0 gives the original image.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): An image with a transform in brightness.

    Returns:
        A callable object of BrightnessTransform.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import BrightnessTransform
            >>> np.random.seed(2023)

            >>> transform = BrightnessTransform(0.4)
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> print(fake_img.load()[1,1])
            (60, 169, 34)
            >>> # doctest: +SKIP('random sample in Brightness function')
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.load()[1,1])
            (68, 192, 38)

    """

    def __init__(self, value, keys=None):
        super().__init__(keys)
        # ``_check_input`` validates ``value``; its result is either ``None``
        # or an indexable pair used as the sampling range below.
        self.value = _check_input(value, 'brightness')

    def _apply_image(self, img):
        """Scale the brightness of ``img`` by a factor drawn from ``self.value``."""
        if self.value is not None:
            low, high = self.value[0], self.value[1]
            return F.adjust_brightness(img, random.uniform(low, high))
        # No range stored: hand the image back untouched.
        return img
L
LielinJiang 已提交
926 927


928
class ContrastTransform(BaseTransform):
    """Adjust contrast of the image.

    Args:
        value (float): How much to adjust the contrast. Can be any
            non negative number. 0 gives the original image. Values that are
            not plain numbers are handed to ``_check_input`` for validation.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): An image with a transform in contrast.

    Returns:
        A callable object of ContrastTransform.

    Raises:
        ValueError: If ``value`` is a single negative number.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import ContrastTransform

            >>> transform = ContrastTransform(0.4)
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (224, 224)

    """

    def __init__(self, value, keys=None):
        super().__init__(keys)
        # Only compare against zero when ``value`` is a plain number: for a
        # list/tuple ``value < 0`` raises TypeError, which would pre-empt the
        # validation done by ``_check_input`` (the sibling brightness,
        # saturation and hue transforms have no such early comparison).
        if isinstance(value, numbers.Number) and value < 0:
            raise ValueError("contrast value should be non-negative")
        self.value = _check_input(value, 'contrast')

    def _apply_image(self, img):
        """Scale the contrast of ``img`` by a factor drawn from ``self.value``."""
        if self.value is None:
            # No range stored: hand the image back untouched.
            return img

        contrast_factor = random.uniform(self.value[0], self.value[1])
        return F.adjust_contrast(img, contrast_factor)
L
LielinJiang 已提交
971 972


973
class SaturationTransform(BaseTransform):
    """Adjust saturation of the image.

    Args:
        value (float): How much to adjust the saturation. Can be any
            non negative number. 0 gives the original image.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): An image with a transform in saturation.

    Returns:
        A callable object of SaturationTransform.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import SaturationTransform

            >>> transform = SaturationTransform(0.4)
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (224, 224)
    """

    def __init__(self, value, keys=None):
        super().__init__(keys)
        # ``_check_input`` validates ``value``; its result is either ``None``
        # or an indexable pair used as the sampling range below.
        self.value = _check_input(value, 'saturation')

    def _apply_image(self, img):
        """Scale the saturation of ``img`` by a factor drawn from ``self.value``."""
        factor_range = self.value
        if factor_range is None:
            # No range stored: hand the image back untouched.
            return img
        return F.adjust_saturation(
            img, random.uniform(factor_range[0], factor_range[1])
        )
L
LielinJiang 已提交
1013

L
LielinJiang 已提交
1014

1015
class HueTransform(BaseTransform):
    """Adjust hue of the image.

    Args:
        value (float): How much to adjust the hue. Can be any number
            between 0 and 0.5, 0 gives the original image.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): An image with a transform in hue.

    Returns:
        A callable object of HueTransform.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import HueTransform

            >>> transform = HueTransform(0.4)
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (224, 224)

    """

    def __init__(self, value, keys=None):
        super().__init__(keys)
        # Hue is validated around a center of 0 within [-0.5, 0.5], and the
        # lower end is not clipped at zero.
        hue_range = _check_input(
            value,
            'hue',
            center=0,
            bound=(-0.5, 0.5),
            clip_first_on_zero=False,
        )
        self.value = hue_range

    def _apply_image(self, img):
        """Shift the hue of ``img`` by a factor drawn from ``self.value``."""
        if self.value is not None:
            lower, upper = self.value[0], self.value[1]
            return F.adjust_hue(img, random.uniform(lower, upper))
        # No range stored: hand the image back untouched.
        return img
L
LielinJiang 已提交
1058 1059


1060
class ColorJitter(BaseTransform):
    """Randomly change the brightness, contrast, saturation and hue of an image.

    Args:
        brightness (float, optional): How much to jitter brightness.
            Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. Should be non negative numbers. Default: 0.
        contrast (float, optional): How much to jitter contrast.
            Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. Should be non negative numbers. Default: 0.
        saturation (float, optional): How much to jitter saturation.
            Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. Should be non negative numbers. Default: 0.
        hue (float, optional): How much to jitter hue.
            Chosen uniformly from [-hue, hue]. Should have 0<= hue <= 0.5. Default: 0.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A color jittered image.

    Returns:
        A callable object of ColorJitter.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import ColorJitter

            >>> transform = ColorJitter(0.4, 0.4, 0.4, 0.4)
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (224, 224)

    """

    def __init__(
        self, brightness=0, contrast=0, saturation=0, hue=0, keys=None
    ):
        super().__init__(keys)
        # The raw settings are stored as given; the individual transforms
        # are constructed from them on every call to ``_apply_image``.
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.hue = hue

    def _get_param(self, brightness, contrast, saturation, hue):
        """Get a randomized transform to be applied on image.

        Arguments are same as that of __init__.

        Returns:
            Transform which randomly adjusts brightness, contrast,
            saturation and hue in a random order.
        """
        candidates = (
            (brightness, BrightnessTransform),
            (contrast, ContrastTransform),
            (saturation, SaturationTransform),
            (hue, HueTransform),
        )
        # One transform per setting that is not None, built in the order above.
        jitters = [
            transform_cls(amount, self.keys)
            for amount, transform_cls in candidates
            if amount is not None
        ]

        random.shuffle(jitters)
        return Compose(jitters)

    def _apply_image(self, img):
        """
        Args:
            img (PIL Image): Input image.

        Returns:
            PIL Image: Color jittered image.
        """
        jitter = self._get_param(
            self.brightness,
            self.contrast,
            self.saturation,
            self.hue,
        )
        return jitter(img)


class RandomCrop(BaseTransform):
    """Crops the given CV Image at a random location.

    Args:
        size (sequence|int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
        padding (int|sequence, optional): Optional padding on each border
            of the image. If a sequence of length 4 is provided, it is used to pad left,
            top, right, bottom borders respectively. Default: None, without padding.
        pad_if_needed (boolean, optional): It will pad the image if smaller than the
            desired size to avoid raising an exception. Default: False.
        fill (float|tuple, optional): Pixel fill value for constant fill. If a tuple of
            length 3, it is used to fill R, G, B channels respectively.
            This value is only used when the padding_mode is constant. Default: 0.
        padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'.

            - constant: pads with a constant value, this value is specified with fill

            - edge: pads with the last value on the edge of the image

            - reflect: pads with reflection of image (without repeating the last value on the edge)

                   padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
                   will result in [3, 2, 1, 2, 3, 4, 3, 2]

            - symmetric: pads with reflection of image (repeating the last value on the edge)

                     padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
                     will result in [2, 1, 1, 2, 3, 4, 4, 3]
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A random cropped image.

    Returns:
        A callable object of RandomCrop.

    Examples:

        .. code-block:: python
            :name: code-example1

            >>> import paddle
            >>> from paddle.vision.transforms import RandomCrop
            >>> transform = RandomCrop(224)

            >>> fake_img = paddle.randint(0, 255, shape=(3, 324,300), dtype = 'int32')
            >>> print(fake_img.shape)
            [3, 324, 300]

            >>> crop_img = transform(fake_img)
            >>> print(crop_img.shape)
            [3, 224, 224]
    """

    def __init__(
        self,
        size,
        padding=None,
        pad_if_needed=False,
        fill=0,
        padding_mode='constant',
        keys=None,
    ):
        super().__init__(keys)
        # A single number means a square crop of that side length.
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    def _get_param(self, img, output_size):
        """Get parameters for ``crop`` for a random crop.

        Args:
            img (PIL Image): Image to be cropped.
            output_size (tuple): Expected output size of the crop.

        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
        """
        w, h = _get_image_size(img)
        th, tw = output_size
        if w == tw and h == th:
            # The image already has the target size: crop from the origin.
            return 0, 0, h, w

        if paddle.in_dynamic_mode():
            # ``random.randint`` includes both end points.
            i = random.randint(0, h - th)
            j = random.randint(0, w - tw)
        else:
            # ``paddle.randint`` samples from [low, high), so ``high`` must be
            # one past the last valid offset to match the dynamic branch and
            # to stay valid when only one offset (0) is possible.
            i = paddle.randint(low=0, high=h - th + 1)
            j = paddle.randint(low=0, high=w - tw + 1)
        return i, j, th, tw

    def _apply_image(self, img):
        """
        Args:
            img (PIL Image): Image to be cropped.

        Returns:
            PIL Image: Cropped image.
        """
        if self.padding is not None:
            img = F.pad(img, self.padding, self.fill, self.padding_mode)

        w, h = _get_image_size(img)

        # pad the width if needed
        if self.pad_if_needed and w < self.size[1]:
            img = F.pad(
                img, (self.size[1] - w, 0), self.fill, self.padding_mode
            )
        # pad the height if needed
        if self.pad_if_needed and h < self.size[0]:
            img = F.pad(
                img, (0, self.size[0] - h), self.fill, self.padding_mode
            )

        # Offsets are drawn against the (possibly padded) image.
        i, j, h, w = self._get_param(img, self.size)

        return F.crop(img, i, j, h, w)
L
LielinJiang 已提交
1274 1275


1276
class Pad(BaseTransform):
    """Pads the given CV Image on all sides with the given "pad" value.

    Args:
        padding (int|list|tuple): Padding on each border. If a single int is provided this
            is used to pad all borders. If list/tuple of length 2 is provided this is the padding
            on left/right and top/bottom respectively. If a list/tuple of length 4 is provided
            this is the padding for the left, top, right and bottom borders
            respectively.
        fill (int|list|tuple): Pixel fill value for constant fill. Default is 0. If a list/tuple of
            length 3, it is used to fill R, G, B channels respectively.
            This value is only used when the padding_mode is constant
        padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant.
            ``constant`` means pads with a constant value, this value is specified with fill.
            ``edge`` means pads with the last value at the edge of the image.
            ``reflect`` means pads with reflection of image (without repeating the last value on the edge)
            padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode
            will result in ``[3, 2, 1, 2, 3, 4, 3, 2]``.
            ``symmetric`` means pads with reflection of image (repeating the last value on the edge)
            padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode
            will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A padded image.

    Returns:
        A callable object of Pad.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import Pad

            >>> transform = Pad(2)
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (228, 228)
    """

    def __init__(self, padding, fill=0, padding_mode='constant', keys=None):
        assert isinstance(padding, (numbers.Number, list, tuple))
        assert isinstance(fill, (numbers.Number, str, list, tuple))
        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']

        # Lists are stored as tuples.
        padding = tuple(padding) if isinstance(padding, list) else padding
        fill = tuple(fill) if isinstance(fill, list) else fill

        if isinstance(padding, Sequence):
            n_sides = len(padding)
            if n_sides not in [2, 4]:
                raise ValueError(
                    "Padding must be an int or a 2, or 4 element tuple, not a "
                    f"{n_sides} element tuple"
                )

        super().__init__(keys)
        self.padding = padding
        self.fill = fill
        self.padding_mode = padding_mode

    def _apply_image(self, img):
        """
        Args:
            img (PIL Image): Image to be padded.

        Returns:
            PIL Image: Padded image.
        """
        return F.pad(
            img,
            self.padding,
            self.fill,
            self.padding_mode,
        )


1353
def _check_sequence_input(x, name, req_sizes):
1354 1355 1356 1357 1358
    msg = (
        req_sizes[0]
        if len(req_sizes) < 2
        else " or ".join([str(s) for s in req_sizes])
    )
1359 1360 1361 1362 1363 1364
    if not isinstance(x, Sequence):
        raise TypeError(f"{name} should be a sequence of length {msg}.")
    if len(x) not in req_sizes:
        raise ValueError(f"{name} should be sequence of length {msg}.")


1365
def _setup_angle(x, name, req_sizes=(2,)):
1366 1367 1368
    if isinstance(x, numbers.Number):
        if x < 0:
            raise ValueError(
1369 1370
                f"If {name} is a single number, it must be positive."
            )
1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386
        x = [-x, x]
    else:
        _check_sequence_input(x, name, req_sizes)

    return [float(d) for d in x]


class RandomAffine(BaseTransform):
    """Random affine transformation of the image.

    Args:
        degrees (int|float|tuple): The angle interval of the random rotation.
            If set as a number instead of sequence like (min, max), the range of degrees
            will be (-degrees, +degrees) in clockwise order. If set 0, will not rotate.
        translate (tuple, optional): Maximum absolute fraction for horizontal and vertical translations.
            For example translate=(a, b), then horizontal shift is randomly sampled in the range -img_width * a < dx < img_width * a
            and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b.
            Default is None, will not translate.
        scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b.
            Default is None, will keep original scale and not scale.
        shear (sequence or number, optional): Range of degrees to shear, ranges from -180 to 180 in clockwise order.
            If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied.
            Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied.
            Else if set as a sequence of 4 values, a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
            Default is None, will not apply shear.
        interpolation (str, optional): Interpolation method. If omitted, or if the
            image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST
            according the backend.
            When use pil backend, support method are as following:
            - "nearest": Image.NEAREST,
            - "bilinear": Image.BILINEAR,
            - "bicubic": Image.BICUBIC
            When use cv2 backend, support method are as following:
            - "nearest": cv2.INTER_NEAREST,
            - "bilinear": cv2.INTER_LINEAR,
            - "bicubic": cv2.INTER_CUBIC
        fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed
            image. If given a number, the value is used for all bands respectively.
        center (2-tuple, optional): Optional center of rotation, (x, y).
            Origin is the upper left corner.
            Default is the center of the image.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): An affined image.

    Returns:
        A callable object of RandomAffine.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.vision.transforms import RandomAffine

            >>> transform = RandomAffine([-90, 90], translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10])
            >>> fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32)
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.shape)
            [3, 256, 300]
    """

    def __init__(
        self,
        degrees,
        translate=None,
        scale=None,
        shear=None,
        interpolation='nearest',
        fill=0,
        center=None,
        keys=None,
    ):
        # The rotation range is validated before the base class is set up.
        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))

        super().__init__(keys)
        assert interpolation in ['nearest', 'bilinear', 'bicubic']
        self.interpolation = interpolation

        # Translation: a pair of fractions, each within [0, 1].
        if translate is not None:
            _check_sequence_input(translate, "translate", req_sizes=(2,))
            if not all(0.0 <= frac <= 1.0 for frac in translate):
                raise ValueError(
                    "translation values should be between 0 and 1"
                )
        self.translate = translate

        # Scale: a pair of strictly positive factors.
        if scale is not None:
            _check_sequence_input(scale, "scale", req_sizes=(2,))
            if any(factor <= 0 for factor in scale):
                raise ValueError("scale values should be positive")
        self.scale = scale

        # Shear: a number, or a sequence of 2 or 4 values; None disables it.
        self.shear = (
            None
            if shear is None
            else _setup_angle(shear, name="shear", req_sizes=(2, 4))
        )

        # A missing fill falls back to 0.
        fill = 0 if fill is None else fill
        if not isinstance(fill, (Sequence, numbers.Number)):
            raise TypeError("Fill should be either a sequence or a number.")
        self.fill = fill

        if center is not None:
            _check_sequence_input(center, "center", req_sizes=(2,))
        self.center = center

    def _get_param(
        self, img_size, degrees, translate=None, scale_ranges=None, shears=None
    ):
        """Get parameters for affine transformation

        Returns:
            params to be passed to the affine transformation
        """
        angle = random.uniform(degrees[0], degrees[1])

        translations = (0, 0)
        if translate is not None:
            # Largest shift per axis: the fraction times the image extent.
            max_dx = float(translate[0] * img_size[0])
            max_dy = float(translate[1] * img_size[1])
            shift_x = int(random.uniform(-max_dx, max_dx))
            shift_y = int(random.uniform(-max_dy, max_dy))
            translations = (shift_x, shift_y)

        scale = 1.0
        if scale_ranges is not None:
            scale = random.uniform(scale_ranges[0], scale_ranges[1])

        shear_x = shear_y = 0.0
        if shears is not None:
            shear_x = random.uniform(shears[0], shears[1])
            # A 4-element range also carries the y-axis shear bounds.
            if len(shears) == 4:
                shear_y = random.uniform(shears[2], shears[3])

        return angle, translations, scale, (shear_x, shear_y)

    def _apply_image(self, img):
        """
        Args:
            img (PIL.Image|np.array): Image to be affine transformed.

        Returns:
            PIL.Image or np.array: Affine transformed image.
        """
        width, height = _get_image_size(img)

        params = self._get_param(
            [width, height],
            self.degrees,
            self.translate,
            self.scale,
            self.shear,
        )

        return F.affine(
            img,
            *params,
            interpolation=self.interpolation,
            fill=self.fill,
            center=self.center,
        )
1539 1540


1541
class RandomRotation(BaseTransform):
    """Rotate the image by an angle sampled at random from a range.

    Args:
        degrees (sequence or float or int): Range of degrees to select from.
            A sequence is read as (min, max) and must have length 2. A single
            non-negative number ``d`` stands for the range (-d, +d),
            clockwise order.
        interpolation (str, optional): Interpolation method. If omitted, or if the
            image has only one channel, it is set to PIL.Image.NEAREST or
            cv2.INTER_NEAREST according to the backend. Default: 'nearest'.
            Supported methods with the pil backend:
            - "nearest": Image.NEAREST,
            - "bilinear": Image.BILINEAR,
            - "bicubic": Image.BICUBIC
            Supported methods with the cv2 backend:
            - "nearest": cv2.INTER_NEAREST,
            - "bilinear": cv2.INTER_LINEAR,
            - "bicubic": cv2.INTER_CUBIC
        expand (bool, optional): Expansion flag. Default: False.
            If true, the output is enlarged so that it holds the entire rotated image.
            If false or omitted, the output has the same size as the input image.
            Note that the expand flag assumes rotation around the center and no translation.
        center (2-tuple, optional): Center of rotation.
            Origin is the upper left corner.
            Default is the center of the image.
        fill (optional): Fill value handed unchanged to ``F.rotate``. Default: 0.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A rotated image.

    Returns:
        A callable object of RandomRotation.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import RandomRotation

            >>> transform = RandomRotation(90)
            >>> fake_img = Image.fromarray((np.random.rand(200, 150, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.size)
            (150, 200)
    """

    def __init__(
        self,
        degrees,
        interpolation='nearest',
        expand=False,
        center=None,
        fill=0,
        keys=None,
    ):
        # Normalise ``degrees`` to a (min, max) pair before anything else.
        if not isinstance(degrees, numbers.Number):
            if len(degrees) != 2:
                raise ValueError(
                    "If degrees is a sequence, it must be of len 2."
                )
            self.degrees = degrees
        else:
            if degrees < 0:
                raise ValueError(
                    "If degrees is a single number, it must be positive."
                )
            self.degrees = (-degrees, degrees)

        super().__init__(keys)
        self.interpolation = interpolation
        self.expand = expand
        self.center = center
        self.fill = fill

    def _get_param(self, degrees):
        """Sample a rotation angle from ``(degrees[0], degrees[1])``.

        Returns a Python float from ``random.uniform`` in dynamic mode and
        the result of ``paddle.uniform`` with shape ``[1]`` otherwise.
        """
        if not paddle.in_dynamic_mode():
            return paddle.uniform(
                [1], dtype="float32", min=degrees[0], max=degrees[1]
            )

        return random.uniform(degrees[0], degrees[1])

    def _apply_image(self, img):
        """Rotate ``img`` by a freshly sampled angle.

        Args:
            img (PIL.Image|np.array): Image to be rotated.

        Returns:
            PIL.Image or np.array: Rotated image.
        """
        sampled_angle = self._get_param(self.degrees)

        rotated = F.rotate(
            img,
            sampled_angle,
            self.interpolation,
            self.expand,
            self.center,
            self.fill,
        )
        return rotated
L
LielinJiang 已提交
1641 1642


1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
class RandomPerspective(BaseTransform):
    """Apply a random perspective transformation with a given probability.

    Args:
        prob (float, optional): Probability of applying the transformation,
            in the range 0 to 1. Default: 0.5.
        distortion_scale (float, optional): Degree of distortion, in the
            range 0 to 1. Default: 0.5.
        interpolation (str, optional): Interpolation method. If omitted, or if
            the image has only one channel, it is set to PIL.Image.NEAREST or
            cv2.INTER_NEAREST. Default: 'nearest'.
            Supported methods with the pil backend:
            - "nearest": Image.NEAREST,
            - "bilinear": Image.BILINEAR,
            - "bicubic": Image.BICUBIC
            Supported methods with the cv2 backend:
            - "nearest": cv2.INTER_NEAREST,
            - "bilinear": cv2.INTER_LINEAR,
            - "bicubic": cv2.INTER_CUBIC
        fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed
            image. If given a number, the value is used for all bands respectively.
            Default: 0.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): A perspectived image.

    Returns:
        A callable object of RandomPerspective.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.vision.transforms import RandomPerspective

            >>> transform = RandomPerspective(prob=1.0, distortion_scale=0.9)
            >>> fake_img = paddle.randn((3, 200, 150)).astype(paddle.float32)
            >>> fake_img = transform(fake_img)
            >>> print(fake_img.shape)
            [3, 200, 150]
    """

    def __init__(
        self,
        prob=0.5,
        distortion_scale=0.5,
        interpolation='nearest',
        fill=0,
        keys=None,
    ):
        super().__init__(keys)

        # Argument checks; the order decides which assertion fires first.
        assert 0 <= prob <= 1, "probability must be between 0 and 1"
        assert (
            0 <= distortion_scale <= 1
        ), "distortion_scale must be between 0 and 1"
        assert interpolation in ('nearest', 'bilinear', 'bicubic')
        assert isinstance(fill, (numbers.Number, str, list, tuple))

        self.prob = prob
        self.distortion_scale = distortion_scale
        self.interpolation = interpolation
        self.fill = fill

    def get_params(self, width, height, distortion_scale):
        """Sample the destination corners of the perspective warp.

        Returns:
            startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image,
            endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image.
        """
        # Largest inward displacement of a corner along each axis.
        reach_x = int(distortion_scale * (width // 2))
        reach_y = int(distortion_scale * (height // 2))

        def draw(low, high):
            # Uniform sample truncated to an int.
            return int(random.uniform(low, high))

        # List items are evaluated left to right, so each corner draws its
        # x coordinate before its y coordinate.
        topleft = [draw(0, reach_x + 1), draw(0, reach_y + 1)]
        topright = [draw(width - reach_x - 1, width), draw(0, reach_y + 1)]
        botright = [
            draw(width - reach_x - 1, width),
            draw(height - reach_y - 1, height),
        ]
        botleft = [draw(0, reach_x + 1), draw(height - reach_y - 1, height)]

        last_col, last_row = width - 1, height - 1
        startpoints = [
            [0, 0],
            [last_col, 0],
            [last_col, last_row],
            [0, last_row],
        ]
        endpoints = [topleft, topright, botright, botleft]

        return startpoints, endpoints

    def _apply_image(self, img):
        """Warp ``img`` with probability ``self.prob``; otherwise return it as is.

        Args:
            img (PIL.Image|np.array|paddle.Tensor): Image to be Perspectively transformed.

        Returns:
            PIL.Image|np.array|paddle.Tensor: Perspectively transformed image.
        """
        # The size is queried before the coin flip, as in the original flow.
        width, height = _get_image_size(img)

        if not random.random() < self.prob:
            return img

        startpoints, endpoints = self.get_params(
            width, height, self.distortion_scale
        )
        return F.perspective(
            img, startpoints, endpoints, self.interpolation, self.fill
        )


1779
class Grayscale(BaseTransform):
    """Convert an image to grayscale.

    Args:
        num_output_channels (int, optional): (1 or 3) number of channels desired for output image. Default: 1.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.

    Shape:
        - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C).
        - output(PIL.Image|np.ndarray|Paddle.Tensor): Grayscale version of the input image.
            - If output_channels == 1 : returned image is single channel
            - If output_channels == 3 : returned image is 3 channel with r == g == b

    Returns:
        A callable object of Grayscale.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from PIL import Image
            >>> from paddle.vision.transforms import Grayscale

            >>> transform = Grayscale()
            >>> fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
            >>> fake_img = transform(fake_img)
            >>> print(np.array(fake_img).shape)
            (224, 224)
    """

    def __init__(self, num_output_channels=1, keys=None):
        super().__init__(keys)
        self.num_output_channels = num_output_channels

    def _apply_image(self, img):
        """Convert ``img`` via ``F.to_grayscale``.

        Args:
            img (PIL.Image|np.ndarray|Paddle.Tensor): Image to be converted to grayscale.

        Returns:
            PIL.Image|np.ndarray|Paddle.Tensor: Grayscale version of ``img``
            with ``self.num_output_channels`` requested as channel count.
        """
        channels = self.num_output_channels
        gray = F.to_grayscale(img, channels)
        return gray
1823 1824 1825 1826 1827 1828 1829


class RandomErasing(BaseTransform):
    """Erase the pixels in a rectangle region selected randomly.

    Args:
        prob (float, optional): Probability of the input data being erased. Default: 0.5.
1830
        scale (sequence, optional): The proportional range of the erased area to the input image.
1831 1832 1833
                                    Default: (0.02, 0.33).
        ratio (sequence, optional): Aspect ratio range of the erased area. Default: (0.3, 3.3).
        value (int|float|sequence|str, optional): The value each pixel in erased area will be replaced with.
1834 1835 1836
                               If value is a single number, all pixels will be erased with this value.
                               If value is a sequence with length 3, the R, G, B channels will be erased
                               respectively. If value is set to "random", each pixel will be erased with
1837 1838 1839
                               random values. Default: 0.
        inplace (bool, optional): Whether this transform is inplace. Default: False.
        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
1840

1841
    Shape:
1842
        - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W).
1843 1844 1845 1846 1847 1848 1849
                 For np.array input, the shape should be (H, W, C).
        - output(paddle.Tensor | np.array | PIL.Image): A random erased image.

    Returns:
        A callable object of RandomErasing.

    Examples:
1850

1851 1852
        .. code-block:: python

1853
            >>> import paddle
1854

1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865
            >>> fake_img = paddle.randn((1, 5, 5)).astype(paddle.float32)
            >>> transform = paddle.vision.transforms.RandomErasing()
            >>> result = transform(fake_img)
            >>> # doctest: +SKIP('random sample')
            >>> print(result)
            Tensor(shape=[1, 5, 5], dtype=float32, place=Place(gpu:0), stop_gradient=True,
            [[[-0.22141267, -0.71004093,  1.71224928,  2.99622107, -0.82959402],
              [ 0.36916021, -0.25601348,  0.86669374,  1.27504587, -0.56462914],
              [-0.45704395, -0.87613666,  1.12195814, -0.87974882,  0.04902615],
              [-0.91549885, -0.15066874,  1.26381516,  0.        ,  0.        ],
              [ 0.87887472, -1.59914243, -0.73970413,  0.        ,  0.        ]]])
J
JYChen 已提交
1866

1867 1868
    """

1869 1870 1871 1872 1873 1874 1875 1876 1877
    def __init__(
        self,
        prob=0.5,
        scale=(0.02, 0.33),
        ratio=(0.3, 3.3),
        value=0,
        inplace=False,
        keys=None,
    ):
        """Validate and store the erasing configuration.

        Raises:
            AssertionError: If ``scale``/``ratio`` is not a tuple or list, is
                not an ordered (min, max) pair in its valid range, if
                ``prob`` is outside [0, 1], or if ``value`` has an
                unsupported type.
            ValueError: If ``value`` is a string other than "random".
        """
        super().__init__(keys)

        pair_types = (tuple, list)

        # The checks run in a fixed order; the first failing one is reported.
        assert isinstance(scale, pair_types), "scale should be a tuple or list"
        assert (
            0 <= scale[0] <= scale[1] <= 1
        ), "scale should be of kind (min, max) and in range [0, 1]"
        assert isinstance(ratio, pair_types), "ratio should be a tuple or list"
        assert (
            0 <= ratio[0] <= ratio[1]
        ), "ratio should be of kind (min, max)"
        assert 0 <= prob <= 1, "The probability should be in range [0, 1]"
        assert isinstance(
            value, (numbers.Number, str) + pair_types
        ), "value should be a number, tuple, list or str"

        # "random" is the only string accepted for ``value``.
        if isinstance(value, str) and value != "random":
            raise ValueError("value must be 'random' when type is str")

        self.prob = prob
        self.scale = scale
        self.ratio = ratio
        self.value = value
        self.inplace = inplace

1906 1907
    def _dynamic_get_param(self, img, scale, ratio, value):
        """Get parameters for ``erase`` for a random erasing in dynamic mode.

        Up to 10 candidate rectangles are sampled; the first one that is
        strictly smaller than the image in both height and width is used.

        Args:
            img (paddle.Tensor | np.array | PIL.Image): Image to be erased.
            scale (sequence, optional): The proportional range of the erased area to the input image.
            ratio (sequence, optional): Aspect ratio range of the erased area.
            value (sequence | None): The value each pixel in erased area will be replaced with.
                               If value is a sequence with length 3, the R, G, B channels will be erased
                               respectively. If value is None, each pixel will be erased with random values.

        Returns:
            tuple: params (i, j, h, w, v) to be passed to ``erase`` for random erase.
            When no candidate fits, ``(0, 0, h, w, img)`` is returned, i.e. the
            whole image region with the image itself as the fill value.

        Raises:
            TypeError: If ``img`` is not a PIL image, a numpy image or a
                tensor image.
        """
        if F._is_pil_image(img):
            # Only the shape is needed, so no dtype conversion is performed.
            shape = np.asarray(img).shape
            h, w, c = shape[-3], shape[-2], shape[-1]
        elif F._is_numpy_image(img):
            h, w, c = img.shape[-3], img.shape[-2], img.shape[-1]
        elif F._is_tensor_image(img):
            c, h, w = img.shape[-3], img.shape[-2], img.shape[-1]
        else:
            # Without this branch h/w/c would be unbound and the failure
            # would surface below as an UnboundLocalError.
            raise TypeError(f"Unexpected type {type(img)}")

        is_tensor = F._is_tensor_image(img)
        img_area = h * w
        # The aspect ratio is sampled uniformly in log space.
        log_ratio = np.log(ratio)
        for _ in range(10):
            erase_area = np.random.uniform(*scale) * img_area
            aspect_ratio = np.exp(np.random.uniform(*log_ratio))
            erase_h = int(round(np.sqrt(erase_area * aspect_ratio)))
            erase_w = int(round(np.sqrt(erase_area / aspect_ratio)))
            if erase_h >= h or erase_w >= w:
                # Candidate does not fit strictly inside the image; retry.
                continue
            if is_tensor:
                if value is None:
                    v = paddle.normal(shape=[c, erase_h, erase_w]).astype(
                        img.dtype
                    )
                else:
                    # Per-channel values, laid out channel-first.
                    v = paddle.to_tensor(value, dtype=img.dtype)[:, None, None]
            else:
                if value is None:
                    v = np.random.normal(size=[erase_h, erase_w, c]) * 255
                else:
                    # Per-channel values, laid out channel-last.
                    v = np.array(value)[None, None, :]
            top = np.random.randint(0, h - erase_h + 1)
            left = np.random.randint(0, w - erase_w + 1)

            return top, left, erase_h, erase_w, v

        return 0, 0, h, w, img

1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046
    def _static_get_param(self, img, scale, ratio, value):
        """Get parameters for ``erase`` for a random erasing in static mode.

        The candidate rectangle is sampled inside a
        ``paddle.static.nn.while_loop`` whose counter is compared against 10;
        position, size and fill value are then selected with
        ``paddle.static.nn.cond``.

        Args:
            img (paddle.static.Variable): Image to be erased.
            scale (sequence, optional): The proportional range of the erased area to the input image.
            ratio (sequence, optional): Aspect ratio range of the erased area.
            value (sequence | None): The value each pixel in erased area will be replaced with.
                               If value is a sequence with length 3, the R, G, B channels will be erased
                               respectively. If value is None, each pixel will be erased with random values.

        Returns:
            tuple: params (top, left, erase_h, erase_w, v, counter); the first
            five are meant to be passed to ``erase`` for random erase and
            ``counter`` is the loop counter returned by the while loop.
        """

        # The last three dimensions of img are read as (c, h, w).
        c, h, w = img.shape[-3], img.shape[-2], img.shape[-1]

        img_area = h * w
        # Aspect ratios are sampled uniformly between the logs of the bounds.
        log_ratio = np.log(np.array(ratio))

        # Loop predicate. ``h`` and ``w`` come from the enclosing scope; they
        # are rebound to int32 tensors below before while_loop is called.
        # NOTE(review): ``and`` / ``or`` are Python boolean operators applied
        # to graph variables here -- confirm they yield the intended combined
        # condition in static mode (vs. paddle.logical_and / logical_or).
        def cond(counter, ten, erase_h, erase_w):
            return counter < ten and (erase_h >= h or erase_w >= w)

        # Loop body: sample a new candidate height/width and bump the counter.
        def body(counter, ten, erase_h, erase_w):
            erase_area = (
                paddle.uniform([1], min=scale[0], max=scale[1]) * img_area
            )
            aspect_ratio = paddle.exp(
                paddle.uniform([1], min=log_ratio[0], max=log_ratio[1])
            )
            erase_h = paddle.round(paddle.sqrt(erase_area * aspect_ratio)).cast(
                "int32"
            )
            erase_w = paddle.round(paddle.sqrt(erase_area / aspect_ratio)).cast(
                "int32"
            )

            counter += 1

            return [counter, ten, erase_h, erase_w]

        h = paddle.assign([h]).astype("int32")
        w = paddle.assign([w]).astype("int32")
        # Start from the full image size, so the initial candidate is not
        # smaller than the image.
        erase_h, erase_w = h.clone(), w.clone()
        counter = paddle.full(
            shape=[1], fill_value=0, dtype='int32'
        )  # loop counter
        ten = paddle.full(
            shape=[1], fill_value=10, dtype='int32'
        )  # loop length
        counter, ten, erase_h, erase_w = paddle.static.nn.while_loop(
            cond, body, [counter, ten, erase_h, erase_w]
        )

        if value is None:
            v = paddle.normal(shape=[c, erase_h, erase_w]).astype(img.dtype)
        else:
            # Adds two trailing axes to value.
            v = value[:, None, None]

        zero = paddle.zeros([1]).astype("int32")
        # NOTE(review): the predicates below also use Python ``and`` on graph
        # variables -- same question as for ``cond`` above.
        # Offsets are sampled only when the candidate fits; otherwise zero.
        top = paddle.static.nn.cond(
            erase_h < h and erase_w < w,
            lambda: paddle.uniform(
                shape=[1], min=0, max=h - erase_h + 1
            ).astype("int32"),
            lambda: zero,
        )

        left = paddle.static.nn.cond(
            erase_h < h and erase_w < w,
            lambda: paddle.uniform(
                shape=[1], min=0, max=w - erase_w + 1
            ).astype("int32"),
            lambda: zero,
        )

        # erase_h is rebound here, so the predicates of the following two
        # cond calls are built from the updated erase_h (and, for ``v``, the
        # updated erase_w as well). Statement order matters.
        erase_h = paddle.static.nn.cond(
            erase_h < h and erase_w < w, lambda: erase_h, lambda: h
        )

        erase_w = paddle.static.nn.cond(
            erase_h < h and erase_w < w, lambda: erase_w, lambda: w
        )

        # Falls back to the image itself as the fill value.
        v = paddle.static.nn.cond(
            erase_h < h and erase_w < w, lambda: v, lambda: img
        )

        return top, left, erase_h, erase_w, v, counter

    def _dynamic_apply_image(self, img):
2047 2048 2049 2050 2051
        """
        Args:
            img (paddle.Tensor | np.array | PIL.Image): Image to be Erased.

        Returns:
2052
            output (paddle.Tensor | np.array | PIL.Image): A random erased image.
2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065
        """

        if random.random() < self.prob:
            if isinstance(self.value, numbers.Number):
                value = [self.value]
            elif isinstance(self.value, str):
                value = None
            else:
                value = self.value
            if value is not None and not (len(value) == 1 or len(value) == 3):
                raise ValueError(
                    "Value should be a single number or a sequence with length equals to image's channel."
                )
2066
            top, left, erase_h, erase_w, v = self._dynamic_get_param(
2067 2068
                img, self.scale, self.ratio, value
            )
2069 2070
            return F.erase(img, top, left, erase_h, erase_w, v, self.inplace)
        return img
2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107

    def _static_apply_image(self, img):
        """Build the erase operation for static mode.

        Args:
            img (paddle.static.Variable): Image to be Erased.

        Returns:
            output (paddle.static.Variable): A random erased image.

        Raises:
            ValueError: If the configured value converts to a variable whose
                first dimension is neither 1 nor 3.
        """
        configured = self.value
        if isinstance(configured, str):
            # A string selects random fill, signalled by None.
            value = None
        elif isinstance(configured, numbers.Number):
            value = paddle.assign([configured]).astype(img.dtype)
        else:
            value = paddle.assign(configured).astype(img.dtype)

        if value is not None and value.shape[0] not in (1, 3):
            raise ValueError(
                "Value should be a single number or a sequence with length equals to image's channel."
            )

        # The trailing loop counter is not needed for erasing.
        top, left, erase_h, erase_w, v, _ = self._static_get_param(
            img, self.scale, self.ratio, value
        )
        return F.erase(img, top, left, erase_h, erase_w, v, self.inplace)

    def _apply_image(self, img):
        """Dispatch to the dynamic or the static erasing implementation.

        In static mode the probability check is expressed with
        ``paddle.static.nn.cond``; in dynamic mode it is handled inside
        ``_dynamic_apply_image``.
        """
        if not paddle.in_dynamic_mode():
            return paddle.static.nn.cond(
                paddle.rand([1]) < self.prob,
                lambda: self._static_apply_image(img),
                lambda: img,
            )

        return self._dynamic_apply_image(img)